diff options
Diffstat (limited to 'net/netfilter/ipvs')
28 files changed, 3647 insertions, 3594 deletions
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 70bd1d0774c..0c3b1670b0d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -28,12 +28,11 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" depends on IPV6 = y || IP_VS = IPV6 + select IP6_NF_IPTABLES ---help--- - Add IPv6 support to IPVS. This is incomplete and might be dangerous. + Add IPv6 support to IPVS. - See http://www.mindbasket.com/ipvs for more information. - - Say N if unsure. + Say Y if unsure. config IP_VS_DEBUG bool "IP virtual server debugging" @@ -122,7 +121,6 @@ config IP_VS_RR config IP_VS_WRR tristate "weighted round-robin scheduling" - select GCD ---help--- The weighted robin-robin scheduling algorithm directs network connections to different real servers based on server weights @@ -232,11 +230,27 @@ config IP_VS_NQ If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +comment 'IPVS SH scheduler' + +config IP_VS_SH_TAB_BITS + int "IPVS source hashing table size (the Nth power of 2)" + range 4 20 + default 8 + ---help--- + The source hashing scheduler maps source IPs to destinations + stored in a hash table. This table is tiled by each destination + until all slots in the table are filled. When using weights to + allow destinations to receive more connections, the table is + tiled an amount proportional to the weights specified. The table + needs to be large enough to effectively fit all the destinations + multiplied by their respective weights. + comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ + NF_CONNTRACK_FTP select IP_VS_NFCT ---help--- FTP is a protocol that transfers IP address and/or port number in diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 059af3120be..dfd7b65b3d2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -31,7 +31,6 @@ #include <net/net_namespace.h> #include <net/protocol.h> #include <net/tcp.h> -#include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -59,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) module_put(app->module); } +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} /* * Allocate/initialize app incarnation and register it in proto apps. @@ -107,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, return 0; out: - kfree(inc->timeout_table); - kfree(inc); + ip_vs_app_inc_destroy(inc); return ret; } @@ -132,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) list_del(&inc->a_list); - kfree(inc->timeout_table); - kfree(inc); + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); } @@ -145,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) { int result; - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); return result; } @@ -157,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) */ void ip_vs_app_inc_put(struct ip_vs_app *inc) { - ip_vs_app_put(inc->app); atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); } @@ -181,45 +190,71 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, } -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct net *net, struct ip_vs_app *app) +/* Register application for netns */ +struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) { struct netns_ipvs *ipvs = net_ipvs(net); - /* increase the module use count */ - ip_vs_use_count_inc(); + struct ip_vs_app *a; + int err = 0; + + if (!ipvs) + return ERR_PTR(-ENOENT); mutex_lock(&__ip_vs_app_mutex); - list_add(&app->a_list, &ipvs->app_list); + list_for_each_entry(a, &ipvs->app_list, a_list) { + if (!strcmp(app->name, a->name)) { + err = -EEXIST; + goto out_unlock; + } + } + a = kmemdup(app, sizeof(*app), GFP_KERNEL); + if (!a) { + err = -ENOMEM; + goto out_unlock; + } + INIT_LIST_HEAD(&a->incs_list); + list_add(&a->a_list, &ipvs->app_list); + /* increase the module use count */ + ip_vs_use_count_inc(); +out_unlock: mutex_unlock(&__ip_vs_app_mutex); - return 0; + return err ? ERR_PTR(err) : a; } /* * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct ip_vs_app *inc, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_app *a, *anxt, *inc, *nxt; + + if (!ipvs) + return; mutex_lock(&__ip_vs_app_mutex); - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(net, inc); - } + list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { + if (app && strcmp(app->name, a->name)) + continue; + list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { + ip_vs_app_inc_release(net, inc); + } - list_del(&app->a_list); + list_del(&a->a_list); + kfree(a); - mutex_unlock(&__ip_vs_app_mutex); + /* decrease the module use count */ + ip_vs_use_count_dec(); + } - /* decrease the module use count */ - ip_vs_use_count_dec(); + mutex_unlock(&__ip_vs_app_mutex); } @@ -314,17 +349,17 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) * Assumes already checked proto==IPPROTO_TCP and diff!=0. */ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) + unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { vseq->previous_delta = vseq->delta; vseq->delta += diff; vseq->init_seq = seq; cp->flags |= flag; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, @@ -576,26 +611,17 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -int __net_init __ip_vs_app_init(struct net *net) +int __net_init ip_vs_app_net_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - -void __net_exit __ip_vs_app_cleanup(struct net *net) -{ - proc_net_remove(net, "ip_vs_app"); -} - -int __init ip_vs_app_init(void) -{ + proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } - -void ip_vs_app_cleanup(void) +void __net_exit ip_vs_app_net_cleanup(struct net *net) { + unregister_ip_vs_app(net, NULL /* all */); + remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index bf28ac2fc99..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -79,58 +79,28 @@ static unsigned int ip_vs_conn_rnd __read_mostly; struct ip_vs_aligned_lock { - rwlock_t l; + spinlock_t l; } __attribute__((__aligned__(SMP_CACHE_BYTES))); /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned key) +static inline void ct_write_lock_bh(unsigned int key) { - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_read_unlock(unsigned key) +static inline void ct_write_unlock_bh(unsigned int key) { - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned key) -{ - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned key) -{ - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned key) -{ - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned key) -{ - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock_bh(unsigned key) -{ - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock_bh(unsigned key) -{ - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto, +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { @@ -188,7 +158,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) @@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* Hash by protocol, client address and port */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pF\n", @@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); return ret; } @@ -220,21 +190,21 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. + * returns bool success. Caller should hold conn reference. */ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; /* unhash it and decrease its reference counter */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del(&cp->c_list); + hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; atomic_dec(&cp->refcnt); ret = 1; @@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) ret = 0; spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret; + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + ret = false; + /* Decrease refcnt and unlink conn only if we are last user */ + if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } else + ret = atomic_read(&cp->refcnt) ? false : true; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); return ret; } @@ -257,30 +257,30 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; - struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->cport == cp->cport && p->vport == cp->vport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); return cp; } } - ct_read_unlock(hash); + rcu_read_unlock(); return NULL; } @@ -308,13 +308,12 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) static int ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, - unsigned int proto_off, int inverse, - struct ip_vs_conn_param *p) + int inverse, struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; struct net *net = skb_net(skb); - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return 1; @@ -329,12 +328,11 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) return NULL; return ip_vs_conn_in_get(&p); @@ -344,20 +342,21 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; - struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (!ip_vs_conn_net_eq(cp, p->net)) - continue; - if (p->pe_data && p->pe->ct_match) { - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) - goto out; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } continue; } @@ -367,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr is a fwmark */ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, p->vaddr, &cp->vaddr) && - p->cport == cp->cport && p->vport == cp->vport && + p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol) - goto out; + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } cp = NULL; out: - if (cp) - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -394,32 +394,32 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp, *ret=NULL; - struct hlist_node *n; /* * Check for "full" addressed entries */ hash = ip_vs_conn_hashkey_param(p, true); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->vport == cp->cport && p->cport == cp->dport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport == cp->cport && p->cport == cp->dport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); ret = cp; break; } } - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -432,12 +432,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) return NULL; return ip_vs_conn_out_get(&p); @@ -463,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->flags & IP_VS_CONN_F_NO_CPORT) { atomic_dec(&ip_vs_conn_no_cport_cnt); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* hash on new dport */ ip_vs_conn_hash(cp); @@ -548,28 +547,31 @@ static inline void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) { unsigned int conn_flags; + __u32 flags; /* if dest is NULL, then return directly */ if (!dest) return; /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); conn_flags = atomic_read(&dest->conn_flags); if (cp->protocol != IPPROTO_UDP) conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; /* Bind with the destination and its corresponding transmitter */ - if (cp->flags & IP_VS_CONN_F_SYNC) { + if (flags & IP_VS_CONN_F_SYNC) { /* if the connection is not template and is created * by sync, preserve the activity flag. */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; /* connections inherit forwarding method from dest */ - cp->flags &= ~IP_VS_CONN_F_FWD_MASK; + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); } - cp->flags |= conn_flags; + flags |= conn_flags; + cp->flags = flags; cp->dest = dest; IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " @@ -584,12 +586,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_read(&dest->refcnt)); /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) atomic_inc(&dest->activeconns); else atomic_inc(&dest->inactconns); @@ -609,18 +611,46 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) * Check if there is a destination for the connection, if so * bind the connection to the destination. */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, - cp->dport, &cp->vaddr, cp->vport, - cp->protocol, cp->fwmark); + rcu_read_lock(); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock_bh(&cp->lock); + if (cp->dest) { + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; + spin_unlock_bh(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + rcu_read_unlock(); } @@ -672,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; } - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); + ip_vs_dest_put(dest); } static int expire_quiescent_template(struct netns_ipvs *ipvs, @@ -734,23 +759,27 @@ int ip_vs_check_template(struct ip_vs_conn *ct) * Simply decrease the refcnt of the template, * don't restart its timer. */ - atomic_dec(&ct->refcnt); + __ip_vs_conn_put(ct); return 0; } return 1; } -static void ip_vs_conn_expire(unsigned long data) +static void ip_vs_conn_rcu_free(struct rcu_head *head) { - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); - cp->timeout = 60*HZ; + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct net *net = ip_vs_conn_net(cp); + struct netns_ipvs *ipvs = net_ipvs(net); /* * do I control anybody? @@ -758,56 +787,60 @@ static void ip_vs_conn_expire(unsigned long data) if (atomic_read(&cp->n_control)) goto expire_later; - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); + del_timer(&cp->timer); /* does anybody control me? */ if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) - ip_vs_conn_drop_conntrack(cp); + if (cp->flags & IP_VS_CONN_F_NFCT) { + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } - ip_vs_pe_put(cp->pe); - kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); return; } - /* hash it back to the table */ - ip_vs_conn_hash(cp); - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt), atomic_read(&cp->n_control)); + atomic_inc(&cp->refcnt); + cp->timeout = 60*HZ; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_conn_put(cp); } - +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. + */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); } @@ -816,7 +849,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; @@ -824,7 +857,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, p->protocol); - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { IP_VS_ERR_RL("%s(): no memory\n", __func__); return NULL; @@ -835,13 +868,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_conn_net_set(cp, p->net); cp->af = p->af; cp->protocol = p->protocol; - ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ + ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->vaddr, p->vaddr); cp->vport = p->vport; - /* proto should only be IPPROTO_IP if d_addr is a fwmark */ - ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + ip_vs_addr_set(p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; @@ -850,6 +883,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->pe = p->pe; cp->pe_data = p->pe_data; cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; } spin_lock_init(&cp->lock); @@ -860,19 +897,30 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, */ atomic_set(&cp->refcnt, 1); + cp->control = NULL; atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); + cp->packet_xmit = NULL; + cp->app = NULL; + cp->app_data = NULL; + /* reset struct ip_vs_seq */ + cp->in_seq.delta = 0; + cp->out_seq.delta = 0; + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); /* Bind the connection with a destination server */ + cp->dest = NULL; ip_vs_bind_dest(cp, dest); /* Set its state and timeout */ cp->state = 0; + cp->old_state = 0; cp->timeout = 3*HZ; + cp->sync_endtime = jiffies & ~3UL; /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 @@ -915,27 +963,30 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_conn *cp; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *n; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + /* __ip_vs_conn_get() is not needed by + * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show + */ if (pos-- == 0) { iter->l = &ip_vs_conn_tab[idx]; return cp; } } - ct_read_unlock_bh(idx); + cond_resched_rcu(); } return NULL; } static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; iter->l = NULL; + rcu_read_lock(); return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } @@ -952,31 +1003,26 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ip_vs_conn_array(seq, 0); /* more on same hash chain? */ - if ((e = cp->c_list.next)) + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) return hlist_entry(e, struct ip_vs_conn, c_list); idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - while (++idx < ip_vs_conn_tab_size) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { iter->l = &ip_vs_conn_tab[idx]; return cp; } - ct_read_unlock_bh(idx); + cond_resched_rcu(); } iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct ip_vs_iter_state *iter = seq->private; - struct hlist_head *l = iter->l; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); + rcu_read_unlock(); } static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) @@ -1049,7 +1095,7 @@ static const struct file_operations ip_vs_conn_fops = { .release = seq_release_net, }; -static const char *ip_vs_origin_name(unsigned flags) +static const char *ip_vs_origin_name(unsigned int flags) { if (flags & IP_VS_CONN_F_SYNC) return "SYNC"; @@ -1155,21 +1201,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; + rcu_read_lock(); /* * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned hash = net_random() & ip_vs_conn_tab_mask; - struct hlist_node *n; + unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; @@ -1189,6 +1230,18 @@ void ip_vs_random_dropentry(struct net *net) default: continue; } + } else if (cp->protocol == IPPROTO_SCTP) { + switch (cp->state) { + case IP_VS_SCTP_S_INIT1: + case IP_VS_SCTP_S_INIT: + break; + case IP_VS_SCTP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + default: + continue; + } } else { if (!todrop_entry(cp)) continue; @@ -1196,13 +1249,17 @@ void ip_vs_random_dropentry(struct net *net) IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(hash); + cond_resched_rcu(); } + rcu_read_unlock(); } @@ -1212,30 +1269,29 @@ void ip_vs_random_dropentry(struct net *net) static void ip_vs_conn_flush(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; struct netns_ipvs *ipvs = net_ipvs(net); flush_again: + rcu_read_lock(); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - struct hlist_node *n; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(idx); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (!ip_vs_conn_net_eq(cp, net)) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(idx); + cond_resched_rcu(); } + rcu_read_unlock(); /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ @@ -1247,23 +1303,23 @@ flush_again: /* * per netns init and exit */ -int __net_init __ip_vs_conn_init(struct net *net) +int __net_init ip_vs_conn_net_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); atomic_set(&ipvs->conn_count, 0); - proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); + proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); return 0; } -void __net_exit __ip_vs_conn_cleanup(struct net *net) +void __net_exit ip_vs_conn_net_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); - proc_net_remove(net, "ip_vs_conn"); - proc_net_remove(net, "ip_vs_conn_sync"); + remove_proc_entry("ip_vs_conn", net->proc_net); + remove_proc_entry("ip_vs_conn_sync", net->proc_net); } int __init ip_vs_conn_init(void) @@ -1301,7 +1357,7 @@ int __init ip_vs_conn_init(void) INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); } /* calculate the random value for connection hash */ @@ -1312,6 +1368,8 @@ int __init ip_vs_conn_init(void) void ip_vs_conn_cleanup(void) { + /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ + rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index bfa808f4da1..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif -int ip_vs_net_id __read_mostly; -#ifdef IP_VS_GENERIC_NETNS -EXPORT_SYMBOL(ip_vs_net_id); -#endif +static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); @@ -80,7 +77,7 @@ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); #define icmp_id(icmph) (((icmph)->un).echo.id) #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) -const char *ip_vs_proto_name(unsigned proto) +const char *ip_vs_proto_name(unsigned int proto) { static char buf[20]; @@ -100,7 +97,7 @@ const char *ip_vs_proto_name(unsigned proto) return "ICMPv6"; #endif default: - sprintf(buf, "IP_%d", proto); + sprintf(buf, "IP_%u", proto); return buf; } } @@ -119,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); s->ustats.inpkts++; @@ -126,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(dest->svc->stats.cpustats); + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); + rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.inpkts++; @@ -149,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); s->ustats.outpkts++; @@ -156,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(dest->svc->stats.cpustats); + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); + rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.outpkts++; @@ -188,14 +193,13 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) } -static inline int +static inline void ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { - if (unlikely(!pd->pp->state_transition)) - return 0; - return pd->pp->state_transition(cp, direction, skb, pd); + if (likely(pd->pp->state_transition)) + pd->pp->state_transition(cp, direction, skb, pd); } static inline int @@ -207,7 +211,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, { ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, vport, p); - p->pe = svc->pe; + p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); @@ -223,33 +227,32 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, - struct sk_buff *skb, - __be16 src_port, __be16 dst_port, int *ignored) + struct sk_buff *skb, __be16 src_port, __be16 dst_port, + int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + (__force __u32) svc->netmask); else #endif - snet.ip = iph.saddr.ip & svc->netmask; + snet.ip = iph->saddr.ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -266,9 +269,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * is created for other persistent services. */ { - int protocol = iph.protocol; - const union nf_inet_addr *vaddr = &iph.daddr; - const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + int protocol = iph->protocol; + const union nf_inet_addr *vaddr = &iph->daddr; __be16 vport = 0; if (dst_port == svc->port) { @@ -303,12 +305,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { + struct ip_vs_scheduler *sched; + /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. ICMP and NF_DROP */ - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -343,14 +348,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dport = dest->port; flags = (svc->flags & IP_VS_SVC_F_ONEPACKET - && iph.protocol == IPPROTO_UDP)? + && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, - src_port, &iph.daddr, dst_port, ¶m); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, + src_port, &iph->daddr, dst_port, ¶m); cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { @@ -393,18 +398,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_proto_data *pd, int *ignored) + struct ip_vs_proto_data *pd, int *ignored, + struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; + struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; unsigned int flags; *ignored = 1; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + /* + * IPv6 frags, only the first hit here. + */ + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return NULL; @@ -424,7 +432,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { + (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); @@ -435,7 +443,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + iph); *ignored = 0; @@ -450,14 +459,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return NULL; } - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } flags = (svc->flags & IP_VS_SVC_F_ONEPACKET - && iph.protocol == IPPROTO_UDP)? + && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* @@ -466,9 +476,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, - &iph.saddr, pptr[0], &iph.daddr, pptr[1], - &p); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], &iph->daddr, + pptr[1], &p); cp = ip_vs_conn_new(&p, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); @@ -497,21 +507,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * no destination is available for a new connection. */ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_proto_data *pd) + struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { __be16 _ports[2], *pptr; - struct ip_vs_iphdr iph; #ifdef CONFIG_SYSCTL struct net *net; struct netns_ipvs *ipvs; int unicast; #endif - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) { - ip_vs_service_put(svc); return NF_DROP; } @@ -520,32 +526,30 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; else #endif - unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST); + unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ ipvs = net_ipvs(net); if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { - int ret, cs; + int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && - iph.protocol == IPPROTO_UDP)? + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - ip_vs_service_put(svc); - /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], &p); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); @@ -557,10 +561,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_in_stats(cp, skb); /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); + ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pd->pp); + ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); @@ -575,12 +579,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) return NF_ACCEPT; - } - - ip_vs_service_put(svc); /* * Notify the client that the destination is unreachable, and @@ -592,9 +592,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net_ = dev_net(skb_dst(skb)->dev); - skb->dev = net->loopback_dev; + skb->dev = net_->loopback_dev; } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else @@ -647,22 +647,17 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { - int err = ip_defrag(skb, user); + int err; + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); return err; } -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) -{ - /* TODO IPv6: Find out what to do here for IPv6 */ - return 0; -} -#endif - static int ip_vs_route_me_harder(int af, struct sk_buff *skb) { #ifdef CONFIG_IP_VS_IPV6 @@ -733,10 +728,19 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct ipv6hdr *iph = ipv6_hdr(skb); - unsigned int icmp_offset = sizeof(struct ipv6hdr); - struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + - icmp_offset); - struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + unsigned int icmp_offset = 0; + unsigned int offs = 0; /* header offset*/ + int protocol; + struct icmp6hdr *icmph; + struct ipv6hdr *ciph; + unsigned short fragoffs; + + ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); + icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); + offs = icmp_offset + sizeof(struct icmp6hdr); + ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); + + protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); if (inout) { iph->saddr = cp->vaddr.in6; @@ -747,10 +751,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } /* the TCP/UDP/SCTP port */ - if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr || - IPPROTO_SCTP == ciph->nexthdr) { - __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol)) { + __be16 *ports = (void *)(skb_network_header(skb) + offs); + IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, + ntohs(inout ? ports[1] : ports[0]), + ntohs(inout ? cp->vport : cp->dport)); if (inout) ports[1] = cp->vport; else @@ -852,7 +859,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, *related = 1; /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -899,51 +906,35 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); if (!cp) return NF_ACCEPT; snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, offset, ihl); + pp, ciph.len, ihl); } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum) + unsigned int hooknum, struct ip_vs_iphdr *ipvsh) { - struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; + struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset; union nf_inet_addr snet; + unsigned int writable; *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); if (ic == NULL) return NF_DROP; - IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - &iph->saddr, &iph->daddr); - /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy @@ -951,42 +942,45 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (ipvsh->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &ipvsh->saddr, &ipvsh->daddr); /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) + ciph.len = ipvsh->len + sizeof(_icmph); + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); + ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + + pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, - "Checking outgoing ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); if (!cp) return NF_ACCEPT; - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, - pp, offset, sizeof(struct ipv6hdr)); + snet.in6 = ciph.saddr.in6; + writable = ciph.len; + return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, + pp, writable, sizeof(struct ipv6hdr)); } #endif @@ -1015,21 +1009,47 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +static inline bool is_new_conn(const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + return th->syn; + } + case IPPROTO_SCTP: { + sctp_chunkhdr_t *sch, schunk; + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return false; + return sch->type == SCTP_CID_INIT; + } + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - struct ip_vs_conn *cp, int ihl) + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); - if (!skb_make_writable(skb, ihl)) + if (!skb_make_writable(skb, iph->len)) goto drop; /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph)) goto drop; #ifdef CONFIG_IP_VS_IPV6 @@ -1116,17 +1136,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (!net_ipvs(net)->enable) return NF_ACCEPT; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(af, skb, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_out_icmp_v6(skb, &related, - hooknum); + hooknum, &iph); if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else #endif @@ -1136,7 +1155,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } pd = ip_vs_proto_data_get(net, iph.protocol); @@ -1146,45 +1164,35 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, - ip_vs_defrag_user(hooknum))) - return NF_STOLEN; - } - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } else + if (af == AF_INET) #endif - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { + if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); + cp = pp->conn_out_get(af, skb, &iph, 0); if (likely(cp)) - return handle_response(af, skb, pd, cp, iph.len); + return handle_response(af, skb, pd, cp, &iph); if (sysctl_nat_icmp_send(net) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph.len, + sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(net, af, iph.protocol, - &iph.saddr, - pptr[0])) { + if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST @@ -1199,9 +1207,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - struct net *net = - dev_net(skb_dst(skb)->dev); - if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, @@ -1228,11 +1233,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_out(hooknum, skb, AF_INET); + return ip_vs_out(ops->hooknum, skb, AF_INET); } /* @@ -1240,17 +1245,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_out(ops->hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1261,11 +1260,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_out(hooknum, skb, AF_INET6); + return ip_vs_out(ops->hooknum, skb, AF_INET6); } /* @@ -1273,17 +1272,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_out(ops->hooknum, skb, AF_INET6); } #endif @@ -1305,12 +1298,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offset, ihl, verdict; + unsigned int offset, offset2, ihl, verdict; + bool ipip; *related = 1; /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1347,6 +1341,21 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) net = skb_net(skb); + /* Special case for errors for IPIP packets */ + ipip = false; + if (cih->protocol == IPPROTO_IPIP) { + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + /* Error for our IPIP must arrive at LOCAL_IN */ + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) + return NF_ACCEPT; + offset += cih->ihl * 4; + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + ipip = true; + } + pd = ip_vs_proto_data_get(net, cih->protocol); if (!pd) return NF_ACCEPT; @@ -1360,11 +1369,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking incoming ICMP for"); - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); + offset2 = offset; + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; + offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. + * For IPIP this is error for request, not for reply. + */ + cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1); if (!cp) return NF_ACCEPT; @@ -1378,51 +1390,95 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) goto out; } + if (ipip) { + __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; + + /* Update the MTU */ + if (ic->type == ICMP_DEST_UNREACH && + ic->code == ICMP_FRAG_NEEDED) { + struct ip_vs_dest *dest = cp->dest; + u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; + + /* Strip outer IP and ICMP, go to IPIP header */ + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; + offset2 -= ihl + sizeof(_icmph); + skb_reset_network_header(skb); + IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); + ipv4_update_pmtu(skb, dev_net(skb->dev), + mtu, 0, 0, 0, 0); + /* Client uses PMTUD? */ + if (!(frag_off & htons(IP_DF))) + goto ignore_ipip; + /* Prefer the resulting PMTU */ + if (dest) { + struct ip_vs_dest_dst *dest_dst; + + rcu_read_lock(); + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + rcu_read_unlock(); + } + if (mtu > 68 + sizeof(struct iphdr)) + mtu -= sizeof(struct iphdr); + info = htonl(mtu); + } + /* Strip outer IP, ICMP and IPIP, go to IP header of + * original request. + */ + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; + skb_reset_network_header(skb); + IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + type, code, ntohl(info)); + icmp_send(skb, type, code, info); + /* ICMP can be shorter but anyways, account it */ + ip_vs_out_stats(cp, skb); + +ignore_ipip: + consume_skb(skb); + verdict = NF_STOLEN; + goto out; + } + /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || + IPPROTO_SCTP == cih->protocol) offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); - out: +out: __ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum, struct ip_vs_iphdr *iph) { struct net *net = NULL; - struct ipv6hdr *iph; + struct ipv6hdr _ip6h, *ip6h; struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offset, verdict; + unsigned int offs_ciph, writable, verdict; *related = 1; - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph); if (ic == NULL) return NF_DROP; - IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - &iph->saddr, &iph->daddr); - /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy @@ -1430,47 +1486,71 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (iph->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) + ciph.len = iph->len + sizeof(_icmph); + offs_ciph = ciph.len; /* Save ip header offset */ + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ + ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ net = skb_net(skb); - pd = ip_vs_proto_data_get(net, cih->nexthdr); + pd = ip_vs_proto_data_get(net, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + /* Cannot handle fragmented embedded protocol */ + if (ciph.fragoffs) return NF_ACCEPT; - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, "Checking incoming ICMPv6 for"); - offset += sizeof(struct ipv6hdr); + /* The embedded headers contain source and dest in reverse order + * if not from localhost + */ + cp = pp->conn_in_get(AF_INET6, skb, &ciph, + (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1); - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); if (!cp) return NF_ACCEPT; + /* VS/TUN, VS/DR and LOCALNODE just let it go */ + if ((hooknum == NF_INET_LOCAL_OUT) && + (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { + __ip_vs_conn_put(cp); + return NF_ACCEPT; + } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr || - IPPROTO_SCTP == cih->nexthdr) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum); + + /* Need to mangle contained IPv6 header in ICMPv6 packet */ + writable = ciph.len; + if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || + IPPROTO_SCTP == ciph.protocol) + writable += 2 * sizeof(__u16); /* Also mangle ports */ + + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); __ip_vs_conn_put(cp); @@ -1491,7 +1571,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; - int ret, restart, pkts; + int ret, pkts; struct netns_ipvs *ipvs; /* Already marked as IPVS request or reply? */ @@ -1506,7 +1586,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(af, skb, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, @@ -1515,10 +1595,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(af, skb, &iph); /* Bad... Do not break raw sockets */ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && @@ -1534,11 +1615,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, + &iph); if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else #endif @@ -1548,7 +1629,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } /* Protocol supported? */ @@ -1559,12 +1639,24 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, &iph, 0); - if (unlikely(!cp)) { + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && + unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && + is_new_conn(skb, &iph)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } + + if (unlikely(!cp) && !iph.fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ int v; - if (!pp->conn_schedule(af, skb, pd, &v, &cp)) + /* Schedule and create new connection entry into &cp */ + if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) return v; } @@ -1572,11 +1664,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_in: packet continues traversal as normal"); + if (iph.fragoffs) { + /* Fragment that couldn't be mapped to a conn entry + * is missing module nf_defrag_ipv6 + */ + IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); + } return NF_ACCEPT; } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1592,9 +1690,9 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); + ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); + ret = cp->packet_xmit(skb, cp, pp, &iph); /* do not touch skb anymore */ else { IP_VS_DBG_RL("warning: packet_xmit is null"); @@ -1615,34 +1713,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) else pkts = atomic_add_return(1, &cp->in_pkts); - if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - cp->protocol == IPPROTO_SCTP) { - if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - (cp->old_state != cp->state && - ((cp->state == IP_VS_SCTP_S_CLOSED) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { - ip_vs_sync_conn(net, cp); - goto out; - } - } - - /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(net, cp); -out: - cp->old_state = cp->state; + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, pkts); ip_vs_conn_put(cp); return ret; @@ -1653,12 +1725,12 @@ out: * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_in(hooknum, skb, AF_INET); + return ip_vs_in(ops->hooknum, skb, AF_INET); } /* @@ -1666,17 +1738,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_in(ops->hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1686,12 +1752,12 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_in(hooknum, skb, AF_INET6); + return ip_vs_in(ops->hooknum, skb, AF_INET6); } /* @@ -1699,17 +1765,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_in(ops->hooknum, skb, AF_INET6); } #endif @@ -1725,42 +1785,48 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, * and send them to ip_vs_in_icmp. */ static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { int r; struct net *net; + struct netns_ipvs *ipvs; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(skb, &r, hooknum); + return ip_vs_in_icmp(skb, &r, ops->hooknum); } #ifdef CONFIG_IP_VS_IPV6 static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { int r; struct net *net; + struct netns_ipvs *ipvs; + struct ip_vs_iphdr iphdr; - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp_v6(skb, &r, hooknum); + return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); } #endif @@ -1770,9 +1836,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, - .priority = 99, + .priority = NF_IP_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1780,32 +1846,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_remote_request4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, - .priority = 101, + .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_local_reply4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, - .priority = -99, + .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_local_request4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, - .priority = -98, + .priority = NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, }, @@ -1813,7 +1879,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, }, @@ -1822,9 +1888,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, - .priority = 99, + .priority = NF_IP6_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1832,32 +1898,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_remote_request6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, - .priority = 101, + .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, - .priority = -99, + .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_local_request6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, - .priority = -98, + .priority = NF_IP6_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp_v6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, @@ -1865,7 +1931,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, }, @@ -1879,10 +1945,9 @@ static int __net_init __ip_vs_init(struct net *net) struct netns_ipvs *ipvs; ipvs = net_generic(net, ip_vs_net_id); - if (ipvs == NULL) { - pr_err("%s(): no memory.\n", __func__); + if (ipvs == NULL) return -ENOMEM; - } + /* Hold the beast until a service is registerd */ ipvs->enable = 0; ipvs->net = net; @@ -1891,22 +1956,22 @@ static int __net_init __ip_vs_init(struct net *net) atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; - if (__ip_vs_estimator_init(net) < 0) + if (ip_vs_estimator_net_init(net) < 0) goto estimator_fail; - if (__ip_vs_control_init(net) < 0) + if (ip_vs_control_net_init(net) < 0) goto control_fail; - if (__ip_vs_protocol_init(net) < 0) + if (ip_vs_protocol_net_init(net) < 0) goto protocol_fail; - if (__ip_vs_app_init(net) < 0) + if (ip_vs_app_net_init(net) < 0) goto app_fail; - if (__ip_vs_conn_init(net) < 0) + if (ip_vs_conn_net_init(net) < 0) goto conn_fail; - if (__ip_vs_sync_init(net) < 0) + if (ip_vs_sync_net_init(net) < 0) goto sync_fail; printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", @@ -1917,35 +1982,38 @@ static int __net_init __ip_vs_init(struct net *net) */ sync_fail: - __ip_vs_conn_cleanup(net); + ip_vs_conn_net_cleanup(net); conn_fail: - __ip_vs_app_cleanup(net); + ip_vs_app_net_cleanup(net); app_fail: - __ip_vs_protocol_cleanup(net); + ip_vs_protocol_net_cleanup(net); protocol_fail: - __ip_vs_control_cleanup(net); + ip_vs_control_net_cleanup(net); control_fail: - __ip_vs_estimator_cleanup(net); + ip_vs_estimator_net_cleanup(net); estimator_fail: + net->ipvs = NULL; return -ENOMEM; } static void __net_exit __ip_vs_cleanup(struct net *net) { - __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */ - __ip_vs_conn_cleanup(net); - __ip_vs_app_cleanup(net); - __ip_vs_protocol_cleanup(net); - __ip_vs_control_cleanup(net); - __ip_vs_estimator_cleanup(net); + ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ + ip_vs_conn_net_cleanup(net); + ip_vs_app_net_cleanup(net); + ip_vs_protocol_net_cleanup(net); + ip_vs_control_net_cleanup(net); + ip_vs_estimator_net_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) { EnterFunction(2); net_ipvs(net)->enable = 0; /* Disable packet reception */ - __ip_vs_sync_cleanup(net); + smp_wmb(); + ip_vs_sync_net_cleanup(net); LeaveFunction(2); } @@ -1967,36 +2035,23 @@ static int __init ip_vs_init(void) { int ret; - ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { pr_err("can't setup control.\n"); - goto cleanup_estimator; + goto exit; } ip_vs_protocol_init(); - ret = ip_vs_app_init(); - if (ret < 0) { - pr_err("can't setup application helper.\n"); - goto cleanup_protocol; - } - ret = ip_vs_conn_init(); if (ret < 0) { pr_err("can't setup connection table.\n"); - goto cleanup_app; - } - - ret = ip_vs_sync_init(); - if (ret < 0) { - pr_err("can't setup sync data.\n"); - goto cleanup_conn; + goto cleanup_protocol; } ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ if (ret < 0) - goto cleanup_sync; + goto cleanup_conn; ret = register_pernet_device(&ipvs_core_dev_ops); if (ret < 0) @@ -2008,39 +2063,40 @@ static int __init ip_vs_init(void) goto cleanup_dev; } + ret = ip_vs_register_nl_ioctl(); + if (ret < 0) { + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_hooks; + } + pr_info("ipvs loaded.\n"); return ret; +cleanup_hooks: + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: unregister_pernet_subsys(&ipvs_core_ops); -cleanup_sync: - ip_vs_sync_cleanup(); - cleanup_conn: +cleanup_conn: ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: +cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); - cleanup_estimator: - ip_vs_estimator_cleanup(); +exit: return ret; } static void __exit ip_vs_cleanup(void) { + ip_vs_unregister_nl_ioctl(); nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ - ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); - ip_vs_estimator_cleanup(); pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 699c79a5565..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -55,9 +55,6 @@ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG @@ -71,24 +68,24 @@ int ip_vs_get_debug_level(void) /* Protos */ -static void __ip_vs_del_service(struct ip_vs_service *svc); +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(struct net *net, - const struct in6_addr *addr) +static bool __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) { - struct rt6_info *rt; struct flowi6 fl6 = { .daddr = *addr, }; + struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); + bool is_local; - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); - if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) - return 1; + is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); - return 0; + dst_release(dst); + return is_local; } #endif @@ -257,36 +254,38 @@ ip_vs_use_count_dec(void) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ -static inline unsigned -ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, +static inline unsigned int +ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; + __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - addr_fold ^= ((size_t)net>>8); + ahash = ntohl(addr_fold); + ahash ^= ((size_t) net >> 8); - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) { return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } @@ -298,7 +297,7 @@ static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned hash; + unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pF\n", @@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, &svc->addr, svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* Remove it from the svc_table table */ - list_del(&svc->s_list); + hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ - list_del(&svc->f_list); + hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; @@ -361,13 +360,13 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct net *net, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) @@ -388,13 +387,13 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(net, fwmark); - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && net_eq(svc->net, net)) { /* HIT */ @@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) return NULL; } +/* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; struct netns_ipvs *ipvs = net_ipvs(net); - read_lock(&__ip_vs_svc_lock); - /* * Check the table hashed by fwmark first */ @@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, } out: - if (svc) - atomic_inc(&svc->usecnt); - read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), @@ -466,22 +460,35 @@ static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); - dest->svc = svc; + rcu_assign_pointer(dest->svc, svc); } -static void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) +static void ip_vs_service_free(struct ip_vs_service *svc) { - struct ip_vs_service *svc = dest->svc; + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); +} - dest->svc = NULL; +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} + +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{ if (atomic_dec_and_test(&svc->refcnt)) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); + ntohs(svc->port)); + if (do_delay) + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); + else + ip_vs_service_free(svc); } } @@ -489,11 +496,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) /* * Returns hash value for real service */ -static inline unsigned ip_vs_rs_hashkey(int af, +static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -506,17 +513,13 @@ static inline unsigned ip_vs_rs_hashkey(int af, & IP_VS_RTAB_MASK; } -/* - * Hashes ip_vs_dest in rs_table by <proto,addr,port>. - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { - unsigned hash; + unsigned int hash; - if (!list_empty(&dest->d_list)) { - return 0; - } + if (dest->in_rs_table) + return; /* * Hash by proto,addr,port, @@ -524,65 +527,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ipvs->rs_table[hash]); - - return 1; + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; } -/* - * UNhashes ip_vs_dest from rs_table. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; } - - return 1; } -/* - * Lookup real service by <proto,addr,port> in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) { struct netns_ipvs *ipvs = net_ipvs(net); - unsigned hash; + unsigned int hash; struct ip_vs_dest *dest; - /* - * Check for "full" addressed entries - * Return the first found entry - */ + /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&ipvs->rs_lock); - list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { + rcu_read_lock(); + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { /* HIT */ - read_unlock(&ipvs->rs_lock); - return dest; + rcu_read_unlock(); + return true; } } - read_unlock(&ipvs->rs_lock); + rcu_read_unlock(); - return NULL; + return false; } -/* - * Lookup destination by {addr,port} in the given service +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, @@ -593,7 +582,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find the destination for the given service */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == svc->af) && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && (dest->port == dport)) { @@ -607,33 +596,56 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in + * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. + * Called under RCU lock, no refcnt is returned. */ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol, __u32 fwmark) + __be16 vport, __u16 protocol, __u32 fwmark, + __u32 flags) { struct ip_vs_dest *dest; struct ip_vs_service *svc; + __be16 port = dport; - svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); + if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) + port = 0; + dest = ip_vs_lookup_dest(svc, daddr, port); + if (!dest) + dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); return dest; } +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed @@ -648,13 +660,14 @@ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { - struct ip_vs_dest *dest, *nxt; + struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, @@ -670,29 +683,29 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + list_del(&dest->t_list); + ip_vs_dest_hold(dest); + goto out; } } - return NULL; + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; } +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + + __ip_vs_dst_cache_reset(dest); + __ip_vs_svc_put(svc, false); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} /* * Clean up all the destinations in the trash @@ -701,19 +714,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. + * be 0, so we simply release them here. */ static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); } } @@ -763,6 +775,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_service *old_svc; + struct ip_vs_scheduler *sched; int conn_flags; /* set the weight and the flags */ @@ -778,20 +792,19 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * Put the real service in rs_table if not present. * For now only for NAT! */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_hash(ipvs, dest); - write_unlock_bh(&ipvs->rs_lock); } atomic_set(&dest->conn_flags, conn_flags); /* bind the service */ - if (!dest->svc) { + old_svc = rcu_dereference_protected(dest->svc, 1); + if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); + if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); + __ip_vs_svc_put(old_svc, true); } } @@ -804,27 +817,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->l_threshold = udest->l_threshold; spin_lock_bh(&dest->dst_lock); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - if (add) - ip_vs_start_estimator(svc->net, &dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - + sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { - list_add(&dest->n_list, &svc->destinations); + ip_vs_start_estimator(svc->net, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; + if (sched->add_dest) + sched->add_dest(svc, dest); + } else { + if (sched->upd_dest) + sched->upd_dest(svc, dest); } - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); } @@ -836,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { struct ip_vs_dest *dest; - unsigned atype; + unsigned int atype, i; EnterFunction(2); @@ -856,14 +862,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, } dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); - if (dest == NULL) { - pr_err("%s(): no memory.\n", __func__); + if (dest == NULL) return -ENOMEM; - } + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!dest->stats.cpustats) { - pr_err("%s() alloc_percpu failed\n", __func__); + if (!dest->stats.cpustats) goto err_alloc; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_dest_stats; + ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); + u64_stats_init(&ip_vs_dest_stats->syncp); } dest->af = svc->af; @@ -879,7 +888,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->persistconns, 0); atomic_set(&dest->refcnt, 1); - INIT_LIST_HEAD(&dest->d_list); + INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); @@ -921,10 +930,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Check if the dest already exists in the list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); @@ -946,11 +955,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - __ip_vs_update_dest(svc, dest, udest, 1); ret = 0; } else { @@ -990,10 +994,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Lookup the destination list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); @@ -1006,11 +1010,11 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } - /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, + bool cleanup) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1019,38 +1023,20 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&ipvs->rs_lock); - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", - dest->vfwmark, - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port)); - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - free_percpu(dest->stats.cpustats); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ipvs->dest_trash); - atomic_inc(&dest->refcnt); - } + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash without reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = 0; + spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_dest_put(dest); } @@ -1066,14 +1052,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, /* * Remove it from the d-linked destination list. */ - list_del(&dest->n_list); + list_del_rcu(&dest->n_list); svc->num_dests--; - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched->del_dest) + sched->del_dest(svc, dest); + } } @@ -1088,37 +1076,62 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); - write_unlock_bh(&__ip_vs_svc_lock); - /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, false); LeaveFunction(2); return 0; } +static void ip_vs_dest_trash_expire(unsigned long data) +{ + struct net *net = (struct net *) data; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest *dest, *next; + unsigned long now = jiffies; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + if (atomic_read(&dest->refcnt) > 0) + continue; + if (dest->idle_start) { + if (time_before(now, dest->idle_start + + IP_VS_DEST_TRASH_PERIOD)) + continue; + } else { + dest->idle_start = max(1UL, now); + continue; + } + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + spin_unlock(&ipvs->dest_trash_lock); +} /* * Add a service into the service hash table @@ -1127,7 +1140,7 @@ static int ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; + int ret = 0, i; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; @@ -1155,9 +1168,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out_err; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } } #endif @@ -1169,12 +1186,18 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); if (!svc->stats.cpustats) { - pr_err("%s() alloc_percpu failed\n", __func__); + ret = -ENOMEM; goto out_err; } + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_stats; + ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); + u64_stats_init(&ip_vs_stats->syncp); + } + + /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 0); atomic_set(&svc->refcnt, 0); svc->af = u->af; @@ -1188,7 +1211,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->net = net; INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ @@ -1198,7 +1221,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, sched = NULL; /* Bind the ct retriever */ - ip_vs_bind_pe(svc, pe); + RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Update the virtual service counters */ @@ -1214,9 +1237,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ipvs->num_services++; /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; /* Now there is a service - full throttle */ @@ -1226,15 +1247,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, out_err: if (svc != NULL) { - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - if (svc->stats.cpustats) - free_percpu(svc->stats.cpustats); - kfree(svc); + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); @@ -1278,18 +1292,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } } #endif - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + /* Bind the new scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + old_sched = sched; + goto out; + } + /* Unbind the old scheduler on success */ + ip_vs_unbind_scheduler(svc, old_sched); + } /* * Set the flags and timeout value @@ -1298,57 +1321,22 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - old_pe = svc->pe; - if (pe != old_pe) { - ip_vs_unbind_pe(svc); - ip_vs_bind_pe(svc, pe); - } + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) + rcu_assign_pointer(svc->pe, pe); - out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); - out: +out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } - /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! * - We are called under _bh lock */ -static void __ip_vs_del_service(struct ip_vs_service *svc) +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; @@ -1364,27 +1352,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) ip_vs_stop_estimator(svc->net, &svc->stats); /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); - /* Unbind persistence engine */ - old_pe = svc->pe; - ip_vs_unbind_pe(svc); + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); ip_vs_pe_put(old_pe); - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, cleanup); } /* @@ -1398,14 +1379,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", - svc->fwmark, - IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); - } + __ip_vs_svc_put(svc, true); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1414,23 +1388,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ -static void ip_vs_unlink_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); + __ip_vs_del_service(svc, cleanup); } /* @@ -1440,7 +1407,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, false); return 0; } @@ -1449,19 +1416,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net) +static int ip_vs_flush(struct net *net, bool cleanup) { int idx; - struct ip_vs_service *svc, *nxt; + struct ip_vs_service *svc; + struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], - s_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1469,10 +1437,10 @@ static int ip_vs_flush(struct net *net) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1483,76 +1451,79 @@ static int ip_vs_flush(struct net *net) * Delete service by {netns} in the service table. * Called by __ip_vs_cleanup() */ -void __ip_vs_service_cleanup(struct net *net) +void ip_vs_service_net_cleanup(struct net *net) { EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net); + ip_vs_flush(net, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } -/* - * Release dst hold by dst_cache - */ + +/* Put all references for device (dst_cache) */ static inline void -__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { + struct ip_vs_dest_dst *dest_dst; + spin_lock_bh(&dest->dst_lock); - if (dest->dst_cache && dest->dst_cache->dev == dev) { + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } -/* - * Netdev event receiver - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to - * a device that is "unregister" it must be released. +/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; - if (event != NETDEV_UNREGISTER) + if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); EnterFunction(2); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } } - list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { - __ip_vs_dev_reset(dest, dev); + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); } + spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); return NOTIFY_DONE; @@ -1565,12 +1536,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; - write_lock_bh(&__ip_vs_svc_lock); list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); return 0; } @@ -1580,14 +1549,14 @@ static int ip_vs_zero_all(struct net *net) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } @@ -1598,8 +1567,12 @@ static int ip_vs_zero_all(struct net *net) } #ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3; + static int -proc_do_defense_mode(ctl_table *table, int write, +proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; @@ -1620,7 +1593,7 @@ proc_do_defense_mode(ctl_table *table, int write, } static int -proc_do_sync_threshold(ctl_table *table, int write, +proc_do_sync_threshold(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1631,7 +1604,8 @@ proc_do_sync_threshold(ctl_table *table, int write, memcpy(val, valp, sizeof(val)); rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + if (write && (valp[0] < 0 || valp[1] < 0 || + (valp[0] >= valp[1] && valp[1]))) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); } @@ -1639,7 +1613,7 @@ proc_do_sync_threshold(ctl_table *table, int write, } static int -proc_do_sync_mode(ctl_table *table, int write, +proc_do_sync_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1651,9 +1625,24 @@ proc_do_sync_mode(ctl_table *table, int write, if ((*valp < 0) || (*valp > 1)) { /* Restore the correct value */ *valp = val; - } else { - struct net *net = current->nsproxy->net_ns; - ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} + +static int +proc_do_sync_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (*valp < 1 || !is_power_of_2(*valp)) { + /* Restore the correct value */ + *valp = val; } } return rc; @@ -1662,7 +1651,7 @@ proc_do_sync_mode(ctl_table *table, int write, /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without - * align with netns init in __ip_vs_control_init() + * align with netns init in ip_vs_control_net_init() */ static struct ctl_table vs_vars[] = { @@ -1717,6 +1706,30 @@ static struct ctl_table vs_vars[] = { .proc_handler = &proc_do_sync_mode, }, { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_ports, + }, + { + .procname = "sync_persist_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_qlen_max", + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, @@ -1729,6 +1742,18 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, { + .procname = "sloppy_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_sctp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "expire_quiescent_template", .maxlen = sizeof(int), .mode = 0644, @@ -1742,11 +1767,37 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_do_sync_threshold, }, { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "pmtu_disc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1845,20 +1896,13 @@ static struct ctl_table vs_vars[] = { { } }; -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); #endif #ifdef CONFIG_PROC_FS struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - struct list_head *table; + struct hlist_head *table; int bucket; }; @@ -1866,7 +1910,7 @@ struct ip_vs_iter { * Write the contents of the VS rule table to a PROCfs file. * (It is kept just for backward compatibility) */ -static inline const char *ip_vs_fwd_name(unsigned flags) +static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: @@ -1891,7 +1935,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; @@ -1902,7 +1946,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; @@ -1915,17 +1960,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) + __acquires(RCU) { - - read_lock_bh(&__ip_vs_svc_lock); + rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *e; + struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; @@ -1938,13 +1982,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { return svc; } } @@ -1955,13 +2000,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) return svc; } @@ -1969,9 +2016,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) + __releases(RCU) { - read_unlock_bh(&__ip_vs_svc_lock); + rcu_read_unlock(); } @@ -1989,6 +2036,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1997,18 +2045,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - svc->scheduler->name); + sched->name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - svc->scheduler->name, + sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, svc->scheduler->name, + svc->fwmark, sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2019,7 +2067,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) else seq_putc(seq, '\n'); - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, @@ -2113,7 +2161,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; - struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_stats_user rates; int i; @@ -2129,10 +2177,10 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) __u64 inbytes, outbytes; do { - start = u64_stats_fetch_begin_bh(&u->syncp); + start = u64_stats_fetch_begin_irq(&u->syncp); inbytes = u->ustats.inbytes; outbytes = u->ustats.outbytes; - } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", i, u->ustats.conns, u->ustats.inpkts, @@ -2283,8 +2331,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_service *svc; struct ip_vs_dest_user *udest_compat; struct ip_vs_dest_user_kern udest; + struct netns_ipvs *ipvs = net_ipvs(net); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) @@ -2303,6 +2352,24 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* increase the module use count */ ip_vs_use_count_inc(); + /* Handle daemons since they have another lock */ + if (cmd == IP_VS_SO_SET_STARTDAEMON || + cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + + if (mutex_lock_interruptible(&ipvs->sync_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + if (cmd == IP_VS_SO_SET_STARTDAEMON) + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); + else + ret = stop_sync_thread(net, dm->state); + mutex_unlock(&ipvs->sync_mutex); + goto out_dec; + } + if (mutex_lock_interruptible(&__ip_vs_mutex)) { ret = -ERESTARTSYS; goto out_dec; @@ -2310,21 +2377,12 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, - dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(net, dm->state); - goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; @@ -2354,11 +2412,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) } /* Lookup the exact service by <protocol, addr, port> or fwmark */ + rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2410,11 +2470,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2433,7 +2496,7 @@ __ip_vs_get_service_entries(struct net *net, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2452,7 +2515,7 @@ __ip_vs_get_service_entries(struct net *net, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2469,7 +2532,7 @@ __ip_vs_get_service_entries(struct net *net, count++; } } - out: +out: return ret; } @@ -2481,17 +2544,20 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; + rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, get->port); + rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; + memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; @@ -2525,6 +2591,8 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) struct ip_vs_proto_data *pd; #endif + memset(u, 0, sizeof (*u)); + #ifdef CONFIG_IP_VS_PROTO_TCP pd = ip_vs_proto_data_get(net, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; @@ -2566,7 +2634,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) @@ -2584,6 +2652,33 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; + /* + * Handle daemons first since it has its own locking + */ + if (cmd == IP_VS_SO_GET_DAEMON) { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (mutex_lock_interruptible(&ipvs->sync_mutex)) + return -ERESTARTSYS; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; + } + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + mutex_unlock(&ipvs->sync_mutex); + return ret; + } if (mutex_lock_interruptible(&__ip_vs_mutex)) return -ERESTARTSYS; @@ -2639,12 +2734,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; + rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(net, AF_INET, entry->protocol, &addr, entry->port); + rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2681,33 +2778,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } break; - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ipvs->sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, - sizeof(d[0].mcast_ifn)); - d[0].syncid = ipvs->master_syncid; - } - if (ipvs->sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, - sizeof(d[1].mcast_ifn)); - d[1].syncid = ipvs->backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - default: ret = -EINVAL; } - out: +out: mutex_unlock(&__ip_vs_mutex); return ret; } @@ -2800,17 +2875,17 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, ip_vs_copy_stats(&ustats, stats); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps); - + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; @@ -2823,6 +2898,8 @@ nla_put_failure: static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; @@ -2831,23 +2908,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, if (!nl_service) return -EMSGSIZE; - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - + if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) + goto nla_put_failure; if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) + goto nla_put_failure; } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || + nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + goto nla_put_failure; } - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - if (svc->pe) - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - + sched = rcu_dereference_protected(svc->scheduler, 1); + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || + nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || + nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) goto nla_put_failure; @@ -2866,7 +2946,7 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb, { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); if (!hdr) @@ -2892,7 +2972,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -2903,7 +2983,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -2959,15 +3039,17 @@ static int ip_vs_genl_parse_service(struct net *net, } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); + usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } + rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, &usvc->addr, usvc->port); + rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ @@ -2997,7 +3079,7 @@ static int ip_vs_genl_parse_service(struct net *net, usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); + usvc->netmask = nla_get_be32(nla_netmask); } return 0; @@ -3022,21 +3104,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) if (!nl_dest) return -EMSGSIZE; - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - + if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, + (atomic_read(&dest->conn_flags) & + IP_VS_CONN_F_FWD_MASK)) || + nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, + atomic_read(&dest->weight)) || + nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns))) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3054,7 +3137,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) @@ -3131,7 +3214,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); + udest->port = nla_get_be16(nla_port); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -3156,8 +3239,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, return 0; } -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid) { struct nlattr *nl_daemon; @@ -3165,10 +3248,10 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, if (!nl_daemon) return -EMSGSIZE; - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - + if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3178,12 +3261,12 @@ nla_put_failure: return -EMSGSIZE; } -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid, struct netlink_callback *cb) { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) @@ -3205,7 +3288,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct net *net = skb_sknet(skb); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, ipvs->master_mcast_ifn, @@ -3225,7 +3308,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, } nla_put_failure: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->sync_mutex); return skb->len; } @@ -3271,13 +3354,9 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) return ip_vs_set_timeout(net, &t); } -static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { - struct ip_vs_service *svc = NULL; - struct ip_vs_service_user_kern usvc; - struct ip_vs_dest_user_kern udest; int ret = 0, cmd; - int need_full_svc = 0, need_full_dest = 0; struct net *net; struct netns_ipvs *ipvs; @@ -3285,19 +3364,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; - mutex_lock(&__ip_vs_mutex); - - if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net); - goto out; - } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(net, info->attrs); - goto out; - } else if (cmd == IPVS_CMD_NEW_DAEMON || - cmd == IPVS_CMD_DEL_DAEMON) { - + if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + mutex_lock(&ipvs->sync_mutex); if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], @@ -3310,6 +3380,31 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) ret = ip_vs_genl_new_daemon(net, daemon_attrs); else ret = ip_vs_genl_del_daemon(net, daemon_attrs); +out: + mutex_unlock(&ipvs->sync_mutex); + } + return ret; +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + struct net *net; + + net = skb_sknet(skb); + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(net, false); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(net, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { @@ -3392,10 +3487,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) void *reply; int ret, cmd, reply_cmd; struct net *net; - struct netns_ipvs *ipvs; net = skb_sknet(skb); - ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3447,21 +3540,26 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) __ip_vs_get_timeouts(net, &t); #ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, + t.tcp_timeout) || + nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout)) + goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) + goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - ip_vs_conn_tab_size); + if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, + IP_VS_VERSION_CODE) || + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size)) + goto nla_put_failure; break; } @@ -3482,7 +3580,7 @@ out: } -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .flags = GENL_ADMIN_PERM, @@ -3536,13 +3634,13 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = { .cmd = IPVS_CMD_NEW_DAEMON, .flags = GENL_ADMIN_PERM, .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, + .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, .flags = GENL_ADMIN_PERM, .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, + .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, @@ -3581,7 +3679,7 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = { static int __init ip_vs_genl_register(void) { return genl_register_family_with_ops(&ip_vs_genl_family, - ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops)); + ip_vs_genl_ops); } static void ip_vs_genl_unregister(void) @@ -3595,7 +3693,7 @@ static void ip_vs_genl_unregister(void) * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL -int __net_init __ip_vs_control_init_sysctl(struct net *net) +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { int idx; struct netns_ipvs *ipvs = net_ipvs(net); @@ -3610,6 +3708,10 @@ int __net_init __ip_vs_control_init_sysctl(struct net *net) tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -3628,18 +3730,33 @@ int __net_init __ip_vs_control_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; + tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + ipvs->sysctl_pmtu_disc = 1; + tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; - ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, - tbl); + ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { if (!net_eq(net, &init_net)) kfree(tbl); @@ -3654,19 +3771,20 @@ int __net_init __ip_vs_control_init_sysctl(struct net *net) return 0; } -void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } #else -int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; } -void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } #endif @@ -3674,35 +3792,41 @@ static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, }; -int __net_init __ip_vs_control_init(struct net *net) +int __net_init ip_vs_control_net_init(struct net *net) { - int idx; + int i, idx; struct netns_ipvs *ipvs = net_ipvs(net); - ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); - /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) - INIT_LIST_HEAD(&ipvs->rs_table[idx]); + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, + (unsigned long) net); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) { - pr_err("%s(): alloc_percpu.\n", __func__); + if (!ipvs->tot_stats.cpustats) return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ipvs_tot_stats; + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + u64_stats_init(&ipvs_tot_stats->syncp); } + spin_lock_init(&ipvs->tot_stats.lock); - proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); - proc_net_fops_create(net, "ip_vs_stats_percpu", 0, - &ip_vs_stats_percpu_fops); + proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, net->proc_net, + &ip_vs_stats_percpu_fops); - if (__ip_vs_control_init_sysctl(net)) + if (ip_vs_control_net_init_sysctl(net)) goto err; return 0; @@ -3712,34 +3836,22 @@ err: return -ENOMEM; } -void __net_exit __ip_vs_control_cleanup(struct net *net) +void __net_exit ip_vs_control_net_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_stop_estimator(net, &ipvs->tot_stats); - __ip_vs_control_cleanup_sysctl(net); - proc_net_remove(net, "ip_vs_stats_percpu"); - proc_net_remove(net, "ip_vs_stats"); - proc_net_remove(net, "ip_vs"); + ip_vs_control_net_cleanup_sysctl(net); + remove_proc_entry("ip_vs_stats_percpu", net->proc_net); + remove_proc_entry("ip_vs_stats", net->proc_net); + remove_proc_entry("ip_vs", net->proc_net); free_percpu(ipvs->tot_stats.cpustats); } -int __init ip_vs_control_init(void) +int __init ip_vs_register_nl_ioctl(void) { - int idx; int ret; - EnterFunction(2); - - /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - - smp_wmb(); /* Do we really need it now ? */ - ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); @@ -3751,27 +3863,47 @@ int __init ip_vs_control_init(void) pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } - - ret = register_netdevice_notifier(&ip_vs_dst_notifier); - if (ret < 0) - goto err_notf; - - LeaveFunction(2); return 0; -err_notf: - ip_vs_genl_unregister(); err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? */ + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; + + LeaveFunction(2); + return 0; +} + void ip_vs_control_cleanup(void) { EnterFunction(2); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); + unregister_netdevice_notifier(&ip_vs_dst_notifier); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 95fd0d14200..c3b84546ea9 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -51,7 +51,7 @@ * IPVS DH bucket */ struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -64,11 +64,15 @@ struct ip_vs_dh_bucket { #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS DH entry */ -static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -85,10 +89,9 @@ static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } @@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); p = p->next; } @@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. */ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -145,52 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - pr_err("%s(): no memory\n", __func__); + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - } - svc->sched_data = tbl; + + svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); return 0; } -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; + struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + ip_vs_dh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; } -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + ip_vs_dh_reassign(s, svc); return 0; } @@ -210,27 +214,26 @@ static inline int is_overloaded(struct ip_vs_dest *dest) * Destination hashing scheduling */ static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + struct ip_vs_dh_state *s; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph->daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); @@ -249,7 +252,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, + .add_dest = ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; @@ -263,6 +267,7 @@ static int __init ip_vs_dh_init(void) static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 508cce98777..1425e9a924c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -56,15 +56,16 @@ * Make a summary from each cpu */ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, - struct ip_vs_cpu_stats *stats) + struct ip_vs_cpu_stats __percpu *stats) { int i; + bool add = false; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; __u64 inbytes, outbytes; - if (i) { + if (add) { sum->conns += s->ustats.conns; sum->inpkts += s->ustats.inpkts; sum->outpkts += s->ustats.outpkts; @@ -76,6 +77,7 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, sum->inbytes += inbytes; sum->outbytes += outbytes; } else { + add = true; sum->conns = s->ustats.conns; sum->inpkts = s->ustats.inpkts; sum->outpkts = s->ustats.outpkts; @@ -192,7 +194,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst, dst->outbps = (e->outbps + 0xF) >> 5; } -int __net_init __ip_vs_estimator_init(struct net *net) +int __net_init ip_vs_estimator_net_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -203,16 +205,7 @@ int __net_init __ip_vs_estimator_init(struct net *net) return 0; } -void __net_exit __ip_vs_estimator_cleanup(struct net *net) +void __net_exit ip_vs_estimator_net_cleanup(struct net *net) { del_timer_sync(&net_ipvs(net)->est_timer); } - -int __init ip_vs_estimator_init(void) -{ - return 0; -} - -void ip_vs_estimator_cleanup(void) -{ -} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index af63553fa33..77c173282f3 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -44,16 +44,17 @@ #include <net/ip_vs.h> -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " +#define SERVER_STRING "227 " +#define CLIENT_STRING "PORT" /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. */ +static unsigned int ports_count = 1; static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); +module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); @@ -79,14 +80,17 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) /* * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. + * with the "pattern", ignoring before "skip" and terminated with + * the "term" character. * <addr,port> is in network order. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, + const char *pattern, size_t plen, + char skip, char term, __be32 *addr, __be16 *port, char **start, char **end) { + char *s, c; unsigned char p[6]; int i = 0; @@ -101,19 +105,38 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (strnicmp(data, pattern, plen) != 0) { return 0; } - *start = data + plen; + s = data + plen; + if (skip) { + int found = 0; + + for (;; s++) { + if (s == data_limit) + return -1; + if (!found) { + if (*s == skip) + found = 1; + } else if (*s != skip) { + break; + } + } + } - for (data = *start; *data != term; data++) { + for (data = s; ; data++) { if (data == data_limit) return -1; + if (*data == term) + break; } *end = data; memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { + for (data = s; ; data++) { + c = *data; + if (c == term) + break; + if (c >= '0' && c <= '9') { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { i++; } else { /* unexpected character */ @@ -124,8 +147,9 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (i != 5) return -1; - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); + *start = s; + *addr = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); return 1; } @@ -153,7 +177,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; + unsigned int buf_len; int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -185,7 +209,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', + sizeof(SERVER_STRING)-1, + '(', ')', &from.ip, &port, &start, &end) != 1) return 1; @@ -242,9 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * hopefully it will succeed on the retransmitted * packet. */ + rcu_read_lock(); ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + iph->ihl * 4, start-data, end-start, buf, buf_len); + rcu_read_unlock(); if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); @@ -345,7 +373,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to.ip, &port, + ' ', '\r', &to.ip, &port, &start, &end) != 1) return 1; @@ -414,18 +442,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net) struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); - app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL); - if (!app) - return -ENOMEM; - INIT_LIST_HEAD(&app->a_list); - INIT_LIST_HEAD(&app->incs_list); - ipvs->ftp_app = app; + if (!ipvs) + return -ENOENT; - ret = register_ip_vs_app(net, app); - if (ret) - goto err_exit; + app = register_ip_vs_app(net, &ip_vs_ftp); + if (IS_ERR(app)) + return PTR_ERR(app); - for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { + for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); @@ -437,9 +461,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) return 0; err_unreg: - unregister_ip_vs_app(net, app); -err_exit: - kfree(ipvs->ftp_app); + unregister_ip_vs_app(net, &ip_vs_ftp); return ret; } /* @@ -447,10 +469,7 @@ err_exit: */ static void __ip_vs_ftp_exit(struct net *net) { - struct netns_ipvs *ipvs = net_ipvs(net); - - unregister_ip_vs_app(net, ipvs->ftp_app); - kfree(ipvs->ftp_app); + unregister_ip_vs_app(net, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { @@ -458,11 +477,12 @@ static struct pernet_operations ip_vs_ftp_ops = { .exit = __ip_vs_ftp_exit, }; -int __init ip_vs_ftp_init(void) +static int __init ip_vs_ftp_init(void) { int rv; rv = register_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns on error */ return rv; } @@ -472,6 +492,7 @@ int __init ip_vs_ftp_init(void) static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 87e40ea77a9..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -90,11 +90,12 @@ * IP address and its destination server */ struct ip_vs_lblc_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest *dest; /* real server (cache) */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -102,12 +103,14 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -115,7 +118,7 @@ struct ip_vs_lblc_table { * IPVS LBLC sysctl table */ #ifdef CONFIG_SYSCTL -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", .data = NULL, @@ -127,22 +130,26 @@ static ctl_table vs_vars_table[] = { }; #endif -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +static void ip_vs_lblc_rcu_free(struct rcu_head *head) { - list_del(&en->list); - /* - * We don't kfree dest because it is referred either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); + struct ip_vs_lblc_entry *en = container_of(head, + struct ip_vs_lblc_entry, + rcu_head); + + ip_vs_dest_put_and_free(en->dest); kfree(en); } +static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) +{ + hlist_del_rcu(&en->list); + call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); +} /* * Returns hash value for IPVS LBLC entry */ -static inline unsigned +static inline unsigned int ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -163,25 +170,22 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { - unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ +/* Get ip_vs_lblc_entry associated with supplied parameters. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblc_hashkey(af, addr); + unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -191,7 +195,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. + * address to a server. Called under spin lock. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, @@ -200,26 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, struct ip_vs_lblc_entry *en; en = ip_vs_lblc_get(dest->af, tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - pr_err("%s(): no memory\n", __func__); - return NULL; - } + if (en) { + if (en->dest == dest) + return en; + ip_vs_lblc_del(en); + } + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); - en->lastuse = jiffies; + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; - atomic_inc(&dest->refcnt); - en->dest = dest; + ip_vs_dest_hold(dest); + en->dest = dest; - ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } + ip_vs_lblc_hash(tbl, en); return en; } @@ -228,17 +229,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +static void ip_vs_lblc_flush(struct ip_vs_service *svc) { - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; int i; - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblc_free(en); + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblc_expiration(struct ip_vs_service *svc) @@ -254,24 +260,25 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc) static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; unsigned long now = jiffies; int i, j; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc))) continue; - ip_vs_lblc_free(en); + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -295,7 +302,8 @@ static void ip_vs_lblc_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -313,26 +321,26 @@ static void ip_vs_lblc_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; - ip_vs_lblc_free(en); + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } tbl->rover = j; out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); } @@ -344,11 +352,10 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - pr_err("%s(): no memory\n", __func__); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) return -ENOMEM; - } + svc->sched_data = tbl; IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(*tbl)); @@ -356,12 +363,13 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -374,7 +382,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; @@ -382,14 +390,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); + ip_vs_lblc_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -411,7 +417,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { @@ -426,13 +432,13 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -460,7 +466,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -475,20 +481,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_iphdr iph; struct ip_vs_dest *dest = NULL; struct ip_vs_lblc_entry *en; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); + en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); if (en) { /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; @@ -502,14 +505,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * free up entries from the trash at any time. */ - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; + dest = en->dest; + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); @@ -519,13 +519,14 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); return dest; @@ -535,8 +536,7 @@ out: /* * IPVS LBLC Scheduler structure */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, @@ -554,20 +554,27 @@ static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblc_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblc_ctl_table == NULL) return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblc_ctl_table[0].procname = NULL; + } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; ipvs->lblc_ctl_header = - register_net_sysctl_table(net, net_vs_ctl_path, - ipvs->lblc_ctl_table); + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); @@ -617,6 +624,7 @@ static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); unregister_pernet_subsys(&ip_vs_lblc_ops); + rcu_barrier(); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 90f618ab6dd..3f21a2f47de 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -89,42 +89,49 @@ */ struct ip_vs_dest_set_elem { struct list_head list; /* list link */ - struct ip_vs_dest *dest; /* destination server */ + struct ip_vs_dest *dest; /* destination server */ + struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ struct list_head list; /* destination list */ - rwlock_t lock; /* lock for this list */ }; -static struct ip_vs_dest_set_elem * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) { struct ip_vs_dest_set_elem *e; - list_for_each_entry(e, &set->list, list) { - if (e->dest == dest) - /* already existed */ - return NULL; + if (check) { + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + return; + } } e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (e == NULL) { - pr_err("%s(): no memory\n", __func__); - return NULL; - } + if (e == NULL) + return; - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); e->dest = dest; - list_add(&e->list, &set->list); + list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; - return e; +} + +static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_set_elem *e; + + e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); + ip_vs_dest_put_and_free(e->dest); + kfree(e); } static void @@ -137,9 +144,8 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) /* HIT */ atomic_dec(&set->size); set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); break; } } @@ -149,17 +155,10 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e, *ep; - write_lock(&set->lock); list_for_each_entry_safe(e, ep, &set->list, list) { - /* - * We don't kfree dest because it is referred either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); } - write_unlock(&set->lock); } /* get weighted least-connection node in the destination set */ @@ -169,11 +168,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) struct ip_vs_dest *dest, *least; int loh, doh; - if (set == NULL) - return NULL; - /* select the first destination server, whose weight > 0 */ - list_for_each_entry(e, &set->list, list) { + list_for_each_entry_rcu(e, &set->list, list) { least = e->dest; if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -188,14 +184,14 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* find the destination with the weighted least load */ nextstage: - list_for_each_entry(e, &set->list, list) { + list_for_each_entry_continue_rcu(e, &set->list, list) { dest = e->dest; if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) + if (((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { least = dest; loh = doh; @@ -236,12 +232,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* find the destination with the weighted most load */ nextstage: - list_for_each_entry(e, &set->list, list) { + list_for_each_entry_continue(e, &set->list, list) { dest = e->dest; doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) + if (((__s64)moh * atomic_read(&dest->weight) < + (__s64)doh * atomic_read(&most->weight)) && (atomic_read(&dest->weight) > 0)) { most = dest; moh = doh; @@ -264,11 +260,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) * IP address and its destination server set */ struct ip_vs_lblcr_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -276,12 +273,14 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -290,7 +289,7 @@ struct ip_vs_lblcr_table { * IPVS LBLCR sysctl table */ -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", .data = NULL, @@ -304,16 +303,16 @@ static ctl_table vs_vars_table[] = { static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { - list_del(&en->list); + hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); - kfree(en); + kfree_rcu(en, rcu_head); } /* * Returns hash value for IPVS LBLCR entry */ -static inline unsigned +static inline unsigned int ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -334,25 +333,22 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { - unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ +/* Get ip_vs_lblcr_entry associated with supplied parameters. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblcr_hashkey(af, addr); + unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -362,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. + * IP address to a server. Called under spin lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, @@ -373,10 +369,8 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, en = ip_vs_lblcr_get(dest->af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - pr_err("%s(): no memory\n", __func__); + if (!en) return NULL; - } en->af = dest->af; ip_vs_addr_copy(dest->af, &en->addr, daddr); @@ -385,14 +379,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* initialize its dest set */ atomic_set(&(en->set.size), 0); INIT_LIST_HEAD(&en->set.list); - rwlock_init(&en->set.lock); + + ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); + return en; } - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest, true); return en; } @@ -401,17 +395,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - /* No locking required, only called during cleanup. */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblcr_expiration(struct ip_vs_service *svc) @@ -429,13 +427,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_after(en->lastuse + sysctl_lblcr_expiration(svc), now)) continue; @@ -443,7 +442,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -467,7 +466,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -485,11 +485,11 @@ static void ip_vs_lblcr_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -497,7 +497,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -515,11 +515,10 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - pr_err("%s(): no memory\n", __func__); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) return -ENOMEM; - } + svc->sched_data = tbl; IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(*tbl)); @@ -527,12 +526,13 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -545,7 +545,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; @@ -553,14 +553,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); + ip_vs_lblcr_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -582,7 +580,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -598,13 +596,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -632,7 +630,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -647,65 +645,56 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_lblcr_table *tbl = svc->sched_data; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest = NULL; + struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); + en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); if (en) { - /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* Get the least loaded destination */ - read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + + time_after(jiffies, en->set.lastmod + sysctl_lblcr_expiration(svc))) { - struct ip_vs_dest *m; + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); + if (dest && !is_overloaded(dest, svc)) goto out; - } /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); - read_unlock(&svc->sched_lock); return NULL; } /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); goto out; + } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); @@ -715,13 +704,14 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); return dest; @@ -750,20 +740,26 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblcr_ctl_table == NULL) return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblcr_ctl_table[0].procname = NULL; } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; ipvs->lblcr_ctl_header = - register_net_sysctl_table(net, net_vs_ctl_path, - ipvs->lblcr_ctl_table); + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); @@ -813,6 +809,7 @@ static void __exit ip_vs_lblcr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); unregister_pernet_subsys(&ip_vs_lblcr_ops); + rcu_barrier(); } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index f391819c0cc..2bdcb1cf212 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -26,7 +26,8 @@ * Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; unsigned int loh = 0, doh; @@ -42,7 +43,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * served, but no new connection is assigned to the server. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->weight) == 0) continue; @@ -84,6 +85,7 @@ static int __init ip_vs_lc_init(void) static void __exit ip_vs_lc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); } module_init(ip_vs_lc_init); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index f454c80df0a..5882bbfd198 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -19,8 +19,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * * Authors: @@ -63,6 +62,7 @@ #include <net/ip_vs.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_zones.h> @@ -82,7 +82,7 @@ void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); struct nf_conntrack_tuple new_tuple; if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || @@ -97,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; + /* Applications may adjust TCP seqs */ + if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && + !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) + return; + /* * The connection is not yet in the hashtable, so we update it. * CIP->VIP will remain the same, so leave the tuple in @@ -127,7 +132,7 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) nf_conntrack_alter_reply(ct, &new_tuple); } -int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +int ip_vs_confirm_conntrack(struct sk_buff *skb) { return nf_conntrack_confirm(skb); } diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 984d9c137d8..961a6de9bb2 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -40,7 +40,7 @@ #include <net/ip_vs.h> -static inline unsigned int +static inline int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { /* @@ -55,10 +55,11 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; + int loh = 0, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -75,7 +76,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD || !atomic_read(&dest->weight)) @@ -91,8 +92,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { + ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight))) { least = dest; loh = doh; } @@ -133,6 +134,7 @@ static int __init ip_vs_nq_init(void) static void __exit ip_vs_nq_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); } module_init(ip_vs_nq_init); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 5cf859ccb31..1a82b29ce8e 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -13,20 +13,8 @@ /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_pe_lock); - -/* Bind a service with a pe */ -void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) -{ - svc->pe = pe; -} - -/* Unbind a service from its pe */ -void ip_vs_unbind_pe(struct ip_vs_service *svc) -{ - svc->pe = NULL; -} +/* semaphore for IPVS PEs. */ +static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) @@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); - spin_lock_bh(&ip_vs_pe_lock); - - list_for_each_entry(pe, &ip_vs_pe, n_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { @@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) } if (strcmp(pe_name, pe->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_pe_lock); + rcu_read_unlock(); return pe; } if (pe->module) module_put(pe->module); } + rcu_read_unlock(); - spin_unlock_bh(&ip_vs_pe_lock); return NULL; } @@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_pe_lock); - - if (!list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - ip_vs_use_count_dec(); - pr_err("%s(): [%s] pe already linked\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. */ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { - spin_unlock_bh(&ip_vs_pe_lock); + mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); @@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) } } /* Add it into the d-linked pe list */ - list_add(&pe->n_list, &ip_vs_pe); - spin_unlock_bh(&ip_vs_pe_lock); + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); @@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { - spin_lock_bh(&ip_vs_pe_lock); - if (list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - pr_err("%s(): [%s] pe is not in the list. failed\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ - list_del(&pe->n_list); - spin_unlock_bh(&ip_vs_pe_lock); + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 13d607ae9c5..bed5f704252 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, const char *callid, size_t callid_len, int *idx) { - size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); memcpy(buf + *idx, callid, len); buf[*idx+len] = '\0'; *idx += len + 1; @@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff, if (ret > 0) break; if (!ret) - return 0; + return -EINVAL; dataoff += *matchoff; } - /* Empty callid is useless */ - if (!*matchlen) - return -EINVAL; - /* Too large is useless */ if (*matchlen > IP_VS_PEDATA_MAXLEN) return -EINVAL; @@ -73,18 +70,20 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) const char *dptr; int retc; - ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(p->af, skb, &iph); /* Only useful with UDP */ if (iph.protocol != IPPROTO_UDP) return -EINVAL; - - /* No Data ? */ + /* todo: IPv6 fragments: + * I think this only should be done for the first fragment. /HS + */ dataoff = iph.len + sizeof(struct udphdr); + if (dataoff >= skb->len) return -EINVAL; - - if ((retc=skb_linearize(skb)) < 0) + retc = skb_linearize(skb); + if (retc < 0) return retc; dptr = skb->data + dataoff; datalen = skb->len - dataoff; @@ -108,7 +107,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct) { - bool ret = 0; + bool ret = false; if (ct->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && @@ -121,7 +120,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, ct->protocol == p->protocol && ct->pe_data && ct->pe_data_len == p->pe_data_len && !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) - ret = 1; + ret = true; IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -164,6 +163,7 @@ static int __init ip_vs_sip_init(void) static void __exit ip_vs_sip_cleanup(void) { unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); } module_init(ip_vs_sip_init); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index eb86028536f..939f7fbe9b4 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -25,7 +25,6 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> -#include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> @@ -49,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; @@ -60,9 +59,6 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } -#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \ - defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \ - defined(CONFIG_IP_VS_PROTO_ESP) /* * register an ipvs protocols netns related data */ @@ -70,25 +66,30 @@ static int register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) { struct netns_ipvs *ipvs = net_ipvs(net); - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = - kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); - if (!pd) { - pr_err("%s(): no memory.\n", __func__); + if (!pd) return -ENOMEM; - } + pd->pp = pp; /* For speed issues */ pd->next = ipvs->proto_data_table[hash]; ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ - if (pp->init_netns != NULL) - pp->init_netns(net, pd); + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink an free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } return 0; } -#endif /* * unregister an ipvs protocol @@ -96,7 +97,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { @@ -119,7 +120,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data **pd_p; - unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { @@ -141,7 +142,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) @@ -155,11 +156,11 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -struct ip_vs_proto_data * +static struct ip_vs_proto_data * __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) @@ -198,7 +199,7 @@ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) int * ip_vs_create_timeout_table(int *table, int size) { - return kmemdup(table, size, GFP_ATOMIC); + return kmemdup(table, size, GFP_KERNEL); } @@ -279,17 +280,17 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) - sprintf(buf, "%pI6->%pI6 frag", &ih->saddr, &ih->daddr); + sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) - sprintf(buf, "TRUNCATED %pI6->%pI6", + sprintf(buf, "TRUNCATED %pI6c->%pI6c", &ih->saddr, &ih->daddr); else - sprintf(buf, "%pI6:%u->%pI6:%u", + sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } @@ -316,27 +317,40 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -int __net_init __ip_vs_protocol_init(struct net *net) +int __net_init ip_vs_protocol_net_init(struct net *net) { + int i, ret; + static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP - register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); + &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP - register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); + &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP - register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); + &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH - register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); + &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP - register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); + &ip_vs_protocol_esp, #endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; } -void __net_exit __ip_vs_protocol_cleanup(struct net *net) +void __net_exit ip_vs_protocol_net_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 5b8eb8b12c3..5de3dd312c0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -57,7 +57,7 @@ ah_esp_conn_fill_param_proto(struct net *net, int af, static struct ip_vs_conn * ah_esp_conn_in_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, unsigned int proto_off, + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn *cp; @@ -85,9 +85,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, static struct ip_vs_conn * ah_esp_conn_out_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; @@ -110,7 +108,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { /* * AH/ESP is only related traffic. Pass the packet to IP stack. diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index d12ed53ec95..2f7ea756404 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -10,36 +10,42 @@ static int sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { struct net *net; struct ip_vs_service *svc; + struct netns_ipvs *ipvs; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; - struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph); - if (sh == NULL) + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh == NULL) { + *verdict = NF_DROP; return 0; + } - sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t), + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), sizeof(_schunkh), &_schunkh); - if (sch == NULL) + if (sch == NULL) { + *verdict = NF_DROP; return 0; + } + net = skb_net(skb); - if ((sch->type == SCTP_CID_INIT) && - (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, - &iph.daddr, sh->dest))) { + ipvs = net_ipvs(net); + rcu_read_lock(); + if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, sh->dest))) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -47,106 +53,119 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pd); - else { - ip_vs_service_put(svc); + *verdict = ip_vs_leave(svc, skb, pd, iph); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } +static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph, + unsigned int sctphoff) +{ + sctph->checksum = sctp_compute_cksum(skb, sctphoff); + skb->ip_summed = CHECKSUM_UNNECESSARY; +} + static int -sctp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { sctp_sctphdr_t *sctph; - unsigned int sctphoff; - struct sk_buff *iter; - __be32 crc32; + unsigned int sctphoff = iph->len; + bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - sctphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - sctphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + ret = ip_vs_app_pkt_out(cp, skb); + if (ret == 0) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; - sctph->source = cp->vport; - /* Calculate the checksum */ - crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); - skb_walk_frags(skb, iter) - crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), - crc32); - crc32 = sctp_end_cksum(crc32); - sctph->checksum = crc32; + /* Only update csum if we really have to */ + if (sctph->source != cp->vport || payload_csum || + skb->ip_summed == CHECKSUM_PARTIAL) { + sctph->source = cp->vport; + sctp_nat_csum(skb, sctph, sctphoff); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } return 1; } static int -sctp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { sctp_sctphdr_t *sctph; - unsigned int sctphoff; - struct sk_buff *iter; - __be32 crc32; + unsigned int sctphoff = iph->len; + bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - sctphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - sctphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_in(cp, skb)) + ret = ip_vs_app_pkt_in(cp, skb); + if (ret == 0) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; - sctph->dest = cp->dport; - /* Calculate the checksum */ - crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); - skb_walk_frags(skb, iter) - crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), - crc32); - crc32 = sctp_end_cksum(crc32); - sctph->checksum = crc32; + /* Only update csum if we really have to */ + if (sctph->dest != cp->dport || payload_csum || + (skb->ip_summed == CHECKSUM_PARTIAL && + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { + sctph->dest = cp->dport; + sctp_nat_csum(skb, sctph, sctphoff); + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } return 1; } @@ -156,10 +175,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int sctphoff; struct sctphdr *sh, _sctph; - struct sk_buff *iter; - __le32 cmp; - __le32 val; - __u32 tmp; + __le32 cmp, val; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) @@ -173,13 +189,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 0; cmp = sh->checksum; - - tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); - skb_walk_frags(skb, iter) - tmp = sctp_update_cksum((__u8 *) iter->data, - skb_headlen(iter), tmp); - - val = sctp_end_cksum(tmp); + val = sctp_compute_cksum(skb, sctphoff); if (val != cmp) { /* CRC failure, dump it. */ @@ -190,710 +200,159 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 1; } -struct ipvs_sctp_nextstate { - int next_state; -}; enum ipvs_sctp_event_t { - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_SER, - IP_VS_SCTP_EVE_INIT_CLI, - IP_VS_SCTP_EVE_INIT_SER, - IP_VS_SCTP_EVE_INIT_ACK_CLI, - IP_VS_SCTP_EVE_INIT_ACK_SER, - IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, - IP_VS_SCTP_EVE_COOKIE_ECHO_SER, - IP_VS_SCTP_EVE_COOKIE_ACK_CLI, - IP_VS_SCTP_EVE_COOKIE_ACK_SER, - IP_VS_SCTP_EVE_ABORT_CLI, - IP_VS_SCTP_EVE__ABORT_SER, - IP_VS_SCTP_EVE_SHUT_CLI, - IP_VS_SCTP_EVE_SHUT_SER, - IP_VS_SCTP_EVE_SHUT_ACK_CLI, - IP_VS_SCTP_EVE_SHUT_ACK_SER, - IP_VS_SCTP_EVE_SHUT_COM_CLI, - IP_VS_SCTP_EVE_SHUT_COM_SER, - IP_VS_SCTP_EVE_LAST + IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ + IP_VS_SCTP_INIT, + IP_VS_SCTP_INIT_ACK, + IP_VS_SCTP_COOKIE_ECHO, + IP_VS_SCTP_COOKIE_ACK, + IP_VS_SCTP_SHUTDOWN, + IP_VS_SCTP_SHUTDOWN_ACK, + IP_VS_SCTP_SHUTDOWN_COMPLETE, + IP_VS_SCTP_ERROR, + IP_VS_SCTP_ABORT, + IP_VS_SCTP_EVENT_LAST }; -static enum ipvs_sctp_event_t sctp_events[255] = { - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_INIT_CLI, - IP_VS_SCTP_EVE_INIT_ACK_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_ABORT_CLI, - IP_VS_SCTP_EVE_SHUT_CLI, - IP_VS_SCTP_EVE_SHUT_ACK_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, - IP_VS_SCTP_EVE_COOKIE_ACK_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_SHUT_COM_CLI, +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { + [SCTP_CID_DATA] = IP_VS_SCTP_DATA, + [SCTP_CID_INIT] = IP_VS_SCTP_INIT, + [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, + [SCTP_CID_SACK] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, + [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, + [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, + [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, + [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, + [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, + [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, + [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, + [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, + [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, }; -static struct ipvs_sctp_nextstate - sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = { - /* - * STATE : IP_VS_SCTP_S_NONE - */ - /*next state *//*event */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }, - }, - /* - * STATE : IP_VS_SCTP_S_INIT_CLI - * Cient sent INIT and is waiting for reply from server(In ECHO_WAIT) - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ECHO_CLI */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_ECHO_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_INIT_SER - * Server sent INIT and waiting for INIT ACK from the client - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_INIT_ACK_CLI - * Client sent INIT ACK and waiting for ECHO from the server - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK has been resent by the client, let us stay is in - * the same state - */ - {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK sent by the server, close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * ECHO by client, it should not happen, close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO by server, this is what we are expecting, move to ECHO_SER - */ - {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, it should not happen, close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * Unexpected COOKIE ACK from server, staty in the same state - */ - {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_INIT_ACK_SER - * Server sent INIT ACK and waiting for ECHO from the client - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * Unexpected INIT_ACK by the client, let us close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK resent by the server, let us move to same state - */ - {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client send the ECHO, this is what we are expecting, - * move to ECHO_CLI - */ - {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO received from the server, Not sure what to do, - * let us close it - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, let us stay in the same state - */ - {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * COOKIE ACK from server, hmm... this should not happen, lets close - * the connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_ECHO_CLI - * Cient sent ECHO and waiting COOKEI ACK from the Server - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK has been by the client, let us close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client resent the ECHO, let us stay in the same state - */ - {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO received from the server, Not sure what to do, - * let us close it - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, this shoud not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * COOKIE ACK from server, this is what we are awaiting,lets move to - * ESTABLISHED. - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_ECHO_SER - * Server sent ECHO and waiting COOKEI ACK from the client - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK has been by the server, let us close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent the ECHO, not sure what to do, let's close the - * connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO resent by the server, stay in the same state - */ - {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, this is what we are expecting, let's move - * to ESTABLISHED. - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * COOKIE ACK from server, this should not happen, lets close the - * connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_ESTABLISHED - * Association established - */ - {{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this should not happen, let's close - * the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_SHUT_CLI - * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ - - {{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN resent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this should not happen, let's close - * the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server sent SHUTDOWN ACK, this is what we are expecting, let's move - * to SHUDOWN_ACK_SER - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_SHUT_SER - * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ - - {{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN resent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN resent from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this is what we are expecting, let's - * move to SHUT_ACK_CLI - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server sent SHUTDOWN ACK, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - - /* - * State : IP_VS_SCTP_S_SHUT_ACK_CLI - * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ - - {{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN sent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN sent from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client resent SHUDTDOWN_ACK, let's stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server sent SHUTDOWN ACK, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - /* - * SHUTDOWN COMPLETE from server this is what we are expecting. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - - /* - * State : IP_VS_SCTP_S_SHUT_ACK_SER - * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ +/* SCTP States: + * See RFC 2960, 4. SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. + * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ - {{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN sent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN sent from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this should not happen let's close - * the connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server resent SHUTDOWN ACK, stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this what we are expecting, let's close - * the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - /* - * SHUTDOWN COMPLETE from server this should not happen. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_CLOSED - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - } +#define sNO IP_VS_SCTP_S_NONE +#define sI1 IP_VS_SCTP_S_INIT1 +#define sIN IP_VS_SCTP_S_INIT +#define sCS IP_VS_SCTP_S_COOKIE_SENT +#define sCR IP_VS_SCTP_S_COOKIE_REPLIED +#define sCW IP_VS_SCTP_S_COOKIE_WAIT +#define sCO IP_VS_SCTP_S_COOKIE +#define sCE IP_VS_SCTP_S_COOKIE_ECHOED +#define sES IP_VS_SCTP_S_ESTABLISHED +#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT +#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ IP_VS_SCTP_S_REJECTED +#define sCL IP_VS_SCTP_S_CLOSED + +static const __u8 sctp_states + [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { + { /* INPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* OUTPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, +/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, +/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* INPUT-ONLY */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, }; -/* - * Timeout table[state] - */ +#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) + +/* Timeout table[state] */ static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { - [IP_VS_SCTP_S_NONE] = 2 * HZ, - [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_INIT_ACK_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_INIT_ACK_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_ECHO_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_ECHO_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_ACK_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_ACK_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_CLOSED] = 10 * HZ, - [IP_VS_SCTP_S_LAST] = 2 * HZ, + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_LAST] = 2 * HZ, }; static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { - [IP_VS_SCTP_S_NONE] = "NONE", - [IP_VS_SCTP_S_INIT_CLI] = "INIT_CLI", - [IP_VS_SCTP_S_INIT_SER] = "INIT_SER", - [IP_VS_SCTP_S_INIT_ACK_CLI] = "INIT_ACK_CLI", - [IP_VS_SCTP_S_INIT_ACK_SER] = "INIT_ACK_SER", - [IP_VS_SCTP_S_ECHO_CLI] = "COOKIE_ECHO_CLI", - [IP_VS_SCTP_S_ECHO_SER] = "COOKIE_ECHO_SER", - [IP_VS_SCTP_S_ESTABLISHED] = "ESTABISHED", - [IP_VS_SCTP_S_SHUT_CLI] = "SHUTDOWN_CLI", - [IP_VS_SCTP_S_SHUT_SER] = "SHUTDOWN_SER", - [IP_VS_SCTP_S_SHUT_ACK_CLI] = "SHUTDOWN_ACK_CLI", - [IP_VS_SCTP_S_SHUT_ACK_SER] = "SHUTDOWN_ACK_SER", - [IP_VS_SCTP_S_CLOSED] = "CLOSED", - [IP_VS_SCTP_S_LAST] = "BUG!" + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT1] = "INIT1", + [IP_VS_SCTP_S_INIT] = "INIT", + [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", + [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", + [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", + [IP_VS_SCTP_S_COOKIE] = "COOKIE", + [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", + [IP_VS_SCTP_S_REJECTED] = "REJECTED", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + [IP_VS_SCTP_S_LAST] = "BUG!", }; @@ -906,14 +365,14 @@ static const char *sctp_state_name(int state) return "?"; } -static inline int +static inline void set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) { sctp_chunkhdr_t _sctpch, *sch; unsigned char chunk_type; int event, next_state; - int ihl; + int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -921,10 +380,10 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ihl = ip_hdrlen(skb); #endif - sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), - sizeof(_sctpch), &_sctpch); + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) - return 0; + return; chunk_type = sch->type; /* @@ -940,25 +399,30 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { - sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + - sch->length), sizeof(_sctpch), &_sctpch); - if (sch) { - if (sch->type == SCTP_CID_ABORT) + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } - event = sctp_events[chunk_type]; + event = (chunk_type < sizeof(sctp_events)) ? + sctp_events[chunk_type] : IP_VS_SCTP_DATA; - /* - * If the direction is IP_VS_DIR_OUTPUT, this event is from server - */ - if (direction == IP_VS_DIR_OUTPUT) - event++; - /* - * get next state + /* Update direction to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected */ - next_state = sctp_states_table[cp->state][event].next_state; + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (direction == IP_VS_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + direction = IP_VS_DIR_INPUT_ONLY; + } + + next_state = sctp_states[direction][event][cp->state]; if (next_state != cp->state) { struct ip_vs_dest *dest = cp->dest; @@ -993,21 +457,15 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->timeout = pd->timeout_table[cp->state = next_state]; else /* What to do ? */ cp->timeout = sctp_timeouts[cp->state = next_state]; - - return 1; } -static int +static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { - int ret = 0; - - spin_lock(&cp->lock); - ret = set_sctp_state(pd, cp, direction, skb); - spin_unlock(&cp->lock); - - return ret; + spin_lock_bh(&cp->lock); + set_sctp_state(pd, cp, direction, skb); + spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) @@ -1027,30 +485,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc) hash = sctp_app_hashkey(port); - spin_lock_bh(&ipvs->sctp_app_lock); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->sctp_app_lock); return ret; } static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); - spin_lock_bh(&ipvs->sctp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->sctp_app_lock); + list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) @@ -1066,12 +519,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - spin_lock(&ipvs->sctp_app_lock); - list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -1087,7 +540,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); out: return result; } @@ -1096,14 +549,16 @@ out: * timeouts is netns related now. * --------------------------------------------- */ -static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); - spin_lock_init(&ipvs->sctp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index c0cc341b840..e3a697234a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -33,33 +33,34 @@ static int tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; - struct ip_vs_iphdr iph; + struct netns_ipvs *ipvs; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } net = skb_net(skb); + ipvs = net_ipvs(net); /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ - if (th->syn && - (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, - &iph.daddr, th->dest))) { + rcu_read_lock(); + if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, th->dest))) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -68,18 +69,17 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pd); - else { - ip_vs_service_put(svc); + *verdict = ip_vs_leave(svc, skb, pd, iph); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -128,20 +128,18 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph, static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; - unsigned int tcphoff; + unsigned int tcphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - tcphoff = ip_hdrlen(skb); oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ @@ -208,20 +206,18 @@ tcp_snat_handler(struct sk_buff *skb, static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; - unsigned int tcphoff; + unsigned int tcphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - tcphoff = ip_hdrlen(skb); oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ @@ -407,7 +403,7 @@ static struct tcp_states_t tcp_states [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ @@ -421,7 +417,7 @@ static struct tcp_states_t tcp_states [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; @@ -430,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ @@ -444,7 +440,7 @@ static struct tcp_states_t tcp_states_dos [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; @@ -546,7 +542,7 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, /* * Handle state transitions */ -static int +static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) @@ -561,13 +557,11 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) - return 0; + return; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); - spin_unlock(&cp->lock); - - return 1; + spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) @@ -588,18 +582,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) hash = tcp_app_hashkey(port); - spin_lock_bh(&ipvs->tcp_app_lock); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->tcp_app_lock); return ret; } @@ -607,13 +599,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock_bh(&ipvs->tcp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->tcp_app_lock); + list_del_rcu(&inc->p_list); } @@ -632,12 +621,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&ipvs->tcp_app_lock); - list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -654,7 +643,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); out: return result; @@ -668,26 +657,28 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ -static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); - spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; pd->tcp_state_table = tcp_states; + return 0; } static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f1282cbe6fe..b62a3c0ff9b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -30,23 +30,23 @@ static int udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; - struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } net = skb_net(skb); - svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, - &iph.daddr, uh->dest); + rcu_read_lock(); + svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, uh->dest); if (svc) { int ignored; @@ -55,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -64,18 +64,17 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pd); - else { - ip_vs_service_put(svc); + *verdict = ip_vs_leave(svc, skb, pd, iph); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -125,20 +124,18 @@ udp_partial_csum_update(int af, struct udphdr *uhdr, static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; - unsigned int udphoff; + unsigned int udphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - udphoff = ip_hdrlen(skb); oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ @@ -210,20 +207,18 @@ udp_snat_handler(struct sk_buff *skb, static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; - unsigned int udphoff; + unsigned int udphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - udphoff = ip_hdrlen(skb); oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ @@ -364,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) hash = udp_app_hashkey(port); - - spin_lock_bh(&ipvs->udp_app_lock); list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->udp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->udp_app_lock); return ret; } @@ -385,12 +377,9 @@ static void udp_unregister_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); - struct netns_ipvs *ipvs = net_ipvs(net); - spin_lock_bh(&ipvs->udp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->udp_app_lock); + list_del_rcu(&inc->p_list); } @@ -408,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&ipvs->udp_app_lock); - list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -430,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); out: return result; @@ -454,28 +443,29 @@ static const char * udp_state_name(int state) return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; } -static int +static void udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { if (unlikely(!pd)) { pr_err("UDP no ns data\n"); - return 0; + return; } cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; } -static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); - spin_lock_init(&ipvs->udp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index c49b388d108..176b87c35e3 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) { - svc->sched_data = &svc->destinations; + struct list_head *p; + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + /* dest is already unlinked, so p->prev is not valid but + * p->next is valid, use it to reach previous entry. + */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -46,38 +55,44 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc) * Round-Robin Scheduling */ static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { - struct list_head *p, *q; - struct ip_vs_dest *dest; + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); + do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. + */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), @@ -94,7 +109,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, .schedule = ip_vs_rr_schedule, }; @@ -106,6 +122,7 @@ static int __init ip_vs_rr_init(void) static void __exit ip_vs_rr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); } module_init(ip_vs_rr_init); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 08dbdd5bc18..4dbcda6258b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err); */ static LIST_HEAD(ip_vs_schedulers); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_sched_lock); +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); /* @@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, { int ret; - svc->scheduler = scheduler; - if (scheduler->init_service) { ret = scheduler->init_service(svc); if (ret) { @@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, return ret; } } - + rcu_assign_pointer(svc->scheduler, scheduler); return 0; } @@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, /* * Unbind a service with its scheduler */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) { - struct ip_vs_scheduler *sched = svc->scheduler; + struct ip_vs_scheduler *cur_sched; - if (!sched) - return 0; + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - pr_err("%s(): done error\n", __func__); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can not be set to NULL */ } @@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return sched; } if (sched->module) module_put(sched->module); } - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return NULL; } @@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - svc->scheduler->name, svc->fwmark, - svc->fwmark, msg); + sched->name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { - IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } @@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (!list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_err("%s(): [%s] scheduler is not in the list. failed\n", __func__, scheduler->name); return -EINVAL; @@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 89ead246ed3..e446b9fa742 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -44,7 +44,7 @@ #include <net/ip_vs.h> -static inline unsigned int +static inline int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { /* @@ -59,10 +59,11 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -79,7 +80,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -94,12 +95,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -134,6 +135,7 @@ static int __init ip_vs_sed_init(void) static void __exit ip_vs_sed_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); + synchronize_rcu(); } module_init(ip_vs_sed_init); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index b5e2556c581..cc65b2f42cd 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -30,6 +30,11 @@ * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * + * The weight destination attribute can be used to control the + * distribution of connections to the destinations in servernode. The + * greater the weight, the more connections the destination + * will receive. + * */ #define KMSG_COMPONENT "IPVS" @@ -43,12 +48,16 @@ #include <net/ip_vs.h> +#include <net/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> + /* * IPVS SH bucket */ struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -61,11 +70,24 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) +struct ip_vs_sh_state { + struct rcu_head rcu_head; + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; +}; + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} /* * Returns hash value for IPVS SH entry */ -static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, unsigned int offset) { __be32 addr_fold = addr->ip; @@ -74,7 +96,8 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; + return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + IP_VS_SH_TAB_MASK; } @@ -82,38 +105,102 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) { - return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; + unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; } +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* first try the dest it's supposed to go to */ + ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + dest = rcu_dereference(s->buckets[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + /* if the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; + hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); + dest = rcu_dereference(s->buckets[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, "SH: selected unavailable " + "server %s:%d (offset %d), reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + /* * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + int d_count; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); + d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); + + IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", + i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + atomic_read(&dest->weight)); + + /* Don't move to next dest until filling weight */ + if (++d_count >= atomic_read(&dest->weight)) { + p = p->next; + d_count = 0; + } - p = p->next; } b++; } @@ -124,16 +211,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. */ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -142,64 +231,84 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - pr_err("%s(): no memory\n", __func__); + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - } - svc->sched_data = tbl; + + svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); return 0; } -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; + struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + ip_vs_sh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; } -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + ip_vs_sh_reassign(s, svc); return 0; } -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - return dest->flags & IP_VS_DEST_F_OVERLOAD; + __be16 port; + struct tcphdr _tcph, *th; + struct udphdr _udph, *uh; + sctp_sctphdr_t _sctph, *sh; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (unlikely(th == NULL)) + return 0; + port = th->source; + break; + case IPPROTO_UDP: + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (unlikely(uh == NULL)) + return 0; + port = uh->source; + break; + case IPPROTO_SCTP: + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(sh == NULL)) + return 0; + port = sh->source; + break; + default: + port = 0; + } + + return port; } @@ -207,28 +316,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest) * Source Hashing scheduling */ static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + struct ip_vs_sh_state *s; + __be16 port = 0; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { + if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) + port = ip_vs_sh_get_port(skb, iph); + + s = (struct ip_vs_sh_state *) svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) + dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + else + dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + + if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), + IP_VS_DBG_ADDR(svc->af, &iph->saddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); @@ -247,7 +360,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; @@ -261,6 +376,7 @@ static int __init ip_vs_sh_init(void) static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index e292e5bddc7..db801263ee9 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -61,6 +61,7 @@ #define SYNC_PROTO_VER 1 /* Protocol version in header */ +static struct lock_class_key __ipvs_sync_key; /* * IPVS sync connection entry * Version 0, i.e. original version. @@ -195,6 +196,7 @@ struct ip_vs_sync_thread_data { struct net *net; struct socket *sock; char *buf; + int id; }; /* Version 0 definition of packet sizes */ @@ -244,7 +246,7 @@ struct ip_vs_sync_thread_data { struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; - __u16 size; + __be16 size; /* ip_vs_sync_conn entries start here */ }; @@ -253,7 +255,7 @@ struct ip_vs_sync_mesg_v0 { struct ip_vs_sync_mesg { __u8 reserved; /* must be zero */ __u8 syncid; - __u16 size; + __be16 size; __u8 nr_conns; __s8 version; /* SYNC_PROTO_VER */ __u16 spare; @@ -270,13 +272,6 @@ struct ip_vs_sync_buff { unsigned char *end; }; -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), -}; - /* * Copy of struct ip_vs_seq * From unaligned network order to aligned host order @@ -299,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) put_unaligned_be32(ho->previous_delta, &no->previous_delta); } -static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_lock); - if (list_empty(&ipvs->sync_queue)) { + if (list_empty(&ms->sync_queue)) { sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); } else { - sb = list_entry(ipvs->sync_queue.next, - struct ip_vs_sync_buff, + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; } spin_unlock_bh(&ipvs->sync_lock); @@ -333,10 +332,10 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs) kfree(sb); return NULL; } - sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ipvs->master_syncid; - sb->mesg->size = sizeof(struct ip_vs_sync_mesg); + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); @@ -352,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct netns_ipvs *ipvs) +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) { - struct ip_vs_sync_buff *sb = ipvs->sync_buff; + struct ip_vs_sync_buff *sb = ms->sync_buff; spin_lock(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ipvs->sync_queue); - else + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) + wake_up_process(ms->master_thread); + } else ip_vs_sync_buff_release(sb); spin_unlock(&ipvs->sync_lock); } @@ -369,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs) * than the specified time or the specified time is zero. */ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_buff_lock); - if (ipvs->sync_buff && - time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { - sb = ipvs->sync_buff; - ipvs->sync_buff = NULL; + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); } else sb = NULL; spin_unlock_bh(&ipvs->sync_buff_lock); return sb; } -/* - * Switch mode from sending version 0 or 1 - * - must handle sync_buf - */ -void ip_vs_sync_switch_mode(struct net *net, int mode) +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(net); - - if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) - return; - if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) - return; - - spin_lock_bh(&ipvs->sync_buff_lock); - /* Buffer empty ? then let buf_create do the job */ - if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { - kfree(ipvs->sync_buff); - ipvs->sync_buff = NULL; - } else { - spin_lock_bh(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&ipvs->sync_buff->list, - &ipvs->sync_queue); - else - ip_vs_sync_buff_release(ipvs->sync_buff); - spin_unlock_bh(&ipvs->sync_lock); - } - spin_unlock_bh(&ipvs->sync_buff_lock); + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; } /* @@ -434,22 +418,121 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; mesg->syncid = ipvs->master_syncid; - mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; sb->firstuse = jiffies; return sb; } +/* Check if connection is controlled by persistence */ +static inline bool in_persistence(struct ip_vs_conn *cp) +{ + for (cp = cp->control; cp; cp = cp->control) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + return true; + } + return false; +} + +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) + return 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | + (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | + (1 << IP_VS_SCTP_S_CLOSED)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. + */ + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { + int retries = orig & 3; + + if (retries >= sysctl_sync_retries(ipvs)) + return 0; + if (time_before(now, orig - cp->timeout + + (sync_refresh_period >> 3))) + return 0; + n |= retries + 1; + } + } + sync_period = sysctl_sync_period(ipvs); + if (sync_period > 0) { + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && + pkts % sync_period != sysctl_sync_threshold(ipvs)) + return 0; + } else if (sync_refresh_period <= 0 && + pkts != sysctl_sync_threshold(ipvs)) + return 0; + +set: + cp->old_state = cp->state; + n = cmpxchg(&cp->sync_endtime, orig, n); + return n == orig || force; +} + /* * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) +static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, + int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; int len; if (unlikely(cp->af != AF_INET)) @@ -458,21 +541,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return; - spin_lock(&ipvs->sync_buff_lock); - if (!ipvs->sync_buff) { - ipvs->sync_buff = - ip_vs_sync_buff_create_v0(ipvs); - if (!ipvs->sync_buff) { - spin_unlock(&ipvs->sync_buff_lock); + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + return; + + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + buff = ms->sync_buff; + if (buff) { + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + /* Send buffer if it is for v1 */ + if (!m->nr_conns) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + } + } + if (!buff) { + buff = ip_vs_sync_buff_create_v0(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; } len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; - s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + s = (struct ip_vs_sync_conn_v0 *) buff->head; /* copy members */ s->reserved = 0; @@ -492,19 +595,25 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) } m->nr_conns++; - m->size += len; - ipvs->sync_buff->head += len; + m->size = htons(ntohs(m->size) + len); + buff->head += len; /* check if there is a space for next one */ - if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + if (buff->head + FULL_CONN_SIZE > buff->end) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(net, cp->control); + cp = cp->control; + if (cp) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + ip_vs_sync_conn(net, cp->control, pkts); + } } /* @@ -512,23 +621,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) * Called by ip_vs_in. * Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; __u8 *p; unsigned int len, pe_name_len, pad; /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp); + ip_vs_sync_conn_v0(net, cp, pkts); return; } /* Do not sync ONE PACKET */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) goto control; sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + /* Sanity checks */ pe_name_len = 0; if (cp->pe_data_len) { @@ -539,7 +654,14 @@ sloop: pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); } - spin_lock(&ipvs->sync_buff_lock); + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -558,28 +680,33 @@ sloop: /* check if there is a space for this one */ pad = 0; - if (ipvs->sync_buff) { - pad = (4 - (size_t)ipvs->sync_buff->head) & 3; - if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; pad = 0; } } - if (!ipvs->sync_buff) { - ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); - if (!ipvs->sync_buff) { - spin_unlock(&ipvs->sync_buff_lock); + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; + m = buff->mesg; } - m = ipvs->sync_buff->mesg; - p = ipvs->sync_buff->head; - ipvs->sync_buff->head += pad + len; - m->size += pad + len; + p = buff->head; + buff->head += pad + len; + m->size = htons(ntohs(m->size) + pad + len); /* Add ev. padding from prev. sync_conn */ while (pad--) *(p++) = 0; @@ -602,9 +729,9 @@ sloop: #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) { p += sizeof(struct ip_vs_sync_v6); - ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6); - ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6); - ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6); + s->v6.caddr = cp->caddr.in6; + s->v6.vaddr = cp->vaddr.in6; + s->v6.daddr = cp->daddr.in6; } else #endif { @@ -636,23 +763,17 @@ sloop: } } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); control: /* synchronize its controller if it has */ cp = cp->control; if (!cp) return; - /* - * Reduce sync rate for templates - * i.e only increment in_pkts for Templates. - */ - if (cp->flags & IP_VS_CONN_F_TEMPLATE) { - int pkts = atomic_add_return(1, &cp->in_pkts); - - if (pkts % sysctl_sync_period(ipvs) != 1) - return; - } + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); goto sloop; } @@ -730,66 +851,46 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, else cp = ip_vs_ct_in_get(param); - if (cp && param->pe_data) /* Free pe_data */ + if (cp) { + /* Free pe_data */ kfree(param->pe_data); - if (!cp) { + + dest = cp->dest; + spin_lock_bh(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); + } else { /* * Find the appropriate destination for the connection. * If it is not found the connection will remain unbound * but still handled. */ + rcu_read_lock(); dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, - param->vport, protocol, fwmark); - - /* Set the approprite ativity flag */ - if (protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } + param->vport, protocol, fwmark, flags); + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); - if (dest) - atomic_dec(&dest->refcnt); + rcu_read_unlock(); if (!cp) { if (param->pe_data) kfree(param->pe_data); IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } } if (opt) @@ -838,7 +939,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); for (i=0; i<m->nr_conns; i++) { - unsigned flags, state; + unsigned int flags, state; if (p + SIMPLE_CONN_SIZE > buffer+buflen) { IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); @@ -1087,10 +1188,8 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, IP_VS_DBG(2, "BACKUP, message header too short\n"); return; } - /* Convert size back to host byte order */ - m2->size = ntohs(m2->size); - if (buflen != m2->size) { + if (buflen != ntohs(m2->size)) { IP_VS_DBG(2, "BACKUP, bogus message size\n"); return; } @@ -1108,7 +1207,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, for (i=0; i<nr_conns; i++) { union ip_vs_sync_conn *s; - unsigned size; + unsigned int size; int retc; p = msg_end; @@ -1148,6 +1247,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, /* + * Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ + lock_sock(sk); + if (mode) { + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, + sysctl_wmem_max); + sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } else { + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, + sysctl_rmem_max); + sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +/* * Setup loopback of outgoing multicasts on a sending socket */ static void set_mcast_loop(struct sock *sk, u_char loop) @@ -1297,9 +1418,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net) +static struct socket *make_send_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1323,6 +1450,9 @@ static struct socket *make_send_sock(struct net *net) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { @@ -1348,9 +1478,15 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net) +static struct socket *make_receive_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1367,7 +1503,10 @@ static struct socket *make_receive_sock(struct net *net) */ sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; + sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, sizeof(struct sockaddr)); @@ -1410,18 +1549,19 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) return len; } -static void +static int ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) { int msize; + int ret; - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); + msize = ntohs(msg->size); - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - pr_err("ip_vs_send_async error\n"); + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; } static int @@ -1437,48 +1577,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) iov.iov_base = buffer; iov.iov_len = (size_t)buflen; - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); if (len < 0) - return -1; + return len; LeaveFunction(7); return len; } +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ms->master_thread); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid); + "syncid = %d, id = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); - while (!kthread_should_stop()) { - while ((sb = sb_dequeue(ipvs))) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; } - - /* check if entries stay in ipvs->sync_buff for 2 seconds */ - sb = get_curr_sync_buff(ipvs, 2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + /* (Ab)use interruptible sleep to avoid increasing + * the load avg. + */ + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop()); + if (unlikely(kthread_should_stop())) + goto done; } - - schedule_timeout_interruptible(HZ); + ip_vs_sync_buff_release(sb); } +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + /* clean up the sync_buff queue */ - while ((sb = sb_dequeue(ipvs))) + while ((sb = sb_dequeue(ipvs, ms))) ip_vs_sync_buff_release(sb); + __set_current_state(TASK_RUNNING); /* clean up the current sync_buff */ - sb = get_curr_sync_buff(ipvs, 0); + sb = get_curr_sync_buff(ipvs, ms, 0); if (sb) ip_vs_sync_buff_release(sb); @@ -1497,8 +1679,8 @@ static int sync_thread_backup(void *data) int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid); + "syncid = %d, id = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1510,15 +1692,12 @@ static int sync_thread_backup(void *data) len = ip_vs_receive(tinfo->sock, tinfo->buf, ipvs->recv_mesg_maxlen); if (len <= 0) { - pr_err("receiving message error\n"); + if (len != -EAGAIN) + pr_err("receiving message error\n"); break; } - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); ip_vs_process_message(tinfo->net, tinfo->buf, len); - local_bh_enable(); } } @@ -1534,85 +1713,142 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; + struct task_struct **array = NULL, *task; struct socket *sock; struct netns_ipvs *ipvs = net_ipvs(net); - char *name, *buf = NULL; + char *name; int (*threadfn)(void *data); + int id, count; int result = -ENOMEM; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; + if (state == IP_VS_STATE_MASTER) { - if (ipvs->master_thread) + if (ipvs->ms) return -EEXIST; strlcpy(ipvs->master_mcast_ifn, mcast_ifn, sizeof(ipvs->master_mcast_ifn)); ipvs->master_syncid = syncid; - realtask = &ipvs->master_thread; - name = "ipvs_master:%d"; + name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; - sock = make_send_sock(net); } else if (state == IP_VS_STATE_BACKUP) { - if (ipvs->backup_thread) + if (ipvs->backup_threads) return -EEXIST; strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, sizeof(ipvs->backup_mcast_ifn)); ipvs->backup_syncid = syncid; - realtask = &ipvs->backup_thread; - name = "ipvs_backup:%d"; + name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(net); } else { return -EINVAL; } - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; - set_sync_mesg_maxlen(net, state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; + ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } else { + array = kzalloc(count * sizeof(struct task_struct *), + GFP_KERNEL); + if (!array) + goto out; } + set_sync_mesg_maxlen(net, state); - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->net = net; - tinfo->sock = sock; - tinfo->buf = buf; + tinfo = NULL; + for (id = 0; id < count; id++) { + if (state == IP_VS_STATE_MASTER) + sock = make_send_sock(net, id); + else + sock = make_receive_sock(net, id); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto outtinfo; + } + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outsocket; + tinfo->net = net; + tinfo->sock = sock; + if (state == IP_VS_STATE_BACKUP) { + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto outtinfo; + } else { + tinfo->buf = NULL; + } + tinfo->id = id; - task = kthread_run(threadfn, tinfo, name, ipvs->gen); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + tinfo = NULL; + if (state == IP_VS_STATE_MASTER) + ipvs->ms[id].master_thread = task; + else + array[id] = task; } /* mark as active */ - *realtask = task; + + if (state == IP_VS_STATE_BACKUP) + ipvs->backup_threads = array; + spin_lock_bh(&ipvs->sync_buff_lock); ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); /* increase the module use count */ ip_vs_use_count_inc(); return 0; -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); outsocket: sk_release_kernel(sock->sk); + +outtinfo: + if (tinfo) { + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + } + count = id; + while (count-- > 0) { + if (state == IP_VS_STATE_MASTER) + kthread_stop(ipvs->ms[count].master_thread); + else + kthread_stop(array[count]); + } + kfree(array); + out: + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } return result; } @@ -1620,38 +1856,60 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + struct task_struct **array; + int id; int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!ipvs->master_thread) + if (!ipvs->ms) return -ESRCH; - pr_info("stopping master sync thread %d ...\n", - task_pid_nr(ipvs->master_thread)); - /* * The lock synchronizes with sb_queue_tail(), so that we don't * add sync buffers to the queue, when we are already in * progress of stopping the master sync daemon. */ - spin_lock_bh(&ipvs->sync_lock); + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ipvs->sync_lock); - retc = kthread_stop(ipvs->master_thread); - ipvs->master_thread = NULL; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ms->master_thread)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(ms->master_thread); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!ipvs->backup_thread) + if (!ipvs->backup_threads) return -ESRCH; - pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(ipvs->backup_thread)); - ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - retc = kthread_stop(ipvs->backup_thread); - ipvs->backup_thread = NULL; + array = ipvs->backup_threads; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(array[id])); + ret = kthread_stop(array[id]); + if (retc >= 0) + retc = ret; + } + kfree(array); + ipvs->backup_threads = NULL; } /* decrease the module use count */ @@ -1663,24 +1921,22 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -int __net_init __ip_vs_sync_init(struct net *net) +int __net_init ip_vs_sync_net_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - INIT_LIST_HEAD(&ipvs->sync_queue); + __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); - - ipvs->sync_mcast_addr.sin_family = AF_INET; - ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); - ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); return 0; } -void __ip_vs_sync_cleanup(struct net *net) +void ip_vs_sync_net_cleanup(struct net *net) { int retc; + struct netns_ipvs *ipvs = net_ipvs(net); + mutex_lock(&ipvs->sync_mutex); retc = stop_sync_thread(net, IP_VS_STATE_MASTER); if (retc && retc != -ESRCH) pr_err("Failed to stop Master Daemon\n"); @@ -1688,13 +1944,5 @@ void __ip_vs_sync_cleanup(struct net *net) retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); if (retc && retc != -ESRCH) pr_err("Failed to stop Backup Daemon\n"); -} - -int __init ip_vs_sync_init(void) -{ - return 0; -} - -void ip_vs_sync_cleanup(void) -{ + mutex_unlock(&ipvs->sync_mutex); } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index bc1bfc48a17..b5b4650d50a 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -31,10 +31,11 @@ * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); @@ -51,7 +52,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -66,12 +67,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -106,6 +107,7 @@ static int __init ip_vs_wlc_init(void) static void __exit ip_vs_wlc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); + synchronize_rcu(); } module_init(ip_vs_wlc_init); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 1ef41f50723..0546cd572d6 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -29,14 +29,45 @@ #include <net/ip_vs.h> +/* The WRR algorithm depends on some caclulations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced with di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di but we run in parallel with + * weight changes, it is possible some dest weight to be reduced + * below di, bad if it is the only available dest. + * + * So, we modify how mw is calculated, now it is reduced with (di - 1), + * so that last cw is 1 to catch such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... + * last pass: cw = 1 + * + */ + /* * current destination pointer for weighted round-robin scheduling */ struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ + struct ip_vs_dest *cl; /* current dest or head */ int cw; /* current weight */ int mw; /* maximum weight */ int di; /* decreasing interval */ + struct rcu_head rcu_head; }; @@ -84,41 +115,45 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) /* * Allocate the mark variable for WRR scheduling */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - pr_err("%s(): no memory\n", __func__); + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); + if (mark == NULL) return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); + + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; svc->sched_data = mark; return 0; } -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) { + struct ip_vs_wrr_mark *mark = svc->sched_data; + /* * Release the mark variable */ - kfree(svc->sched_data); - - return 0; + kfree_rcu(mark, rcu_head); } -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { struct ip_vs_wrr_mark *mark = svc->sched_data; - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -127,82 +162,82 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) * Weighted Round-Robin Scheduling */ static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { - struct ip_vs_dest *dest; + struct ip_vs_dest *dest, *last, *stop = NULL; struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; + bool last_pass = false, restarted = false; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - ip_vs_scheduler_err(svc, - "no destination available: " - "no destinations present"); - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - ip_vs_scheduler_err(svc, - "no destination available"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - ip_vs_scheduler_err(svc, - "no destination available: " - "all destinations are overloaded"); - goto out; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. + */ + stop = last; } } +found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + mark->cl = dest; out: - write_unlock(&svc->sched_lock); + spin_unlock_bh(&svc->sched_lock); return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; } @@ -213,7 +248,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, .schedule = ip_vs_wrr_schedule, }; @@ -225,6 +262,7 @@ static int __init ip_vs_wrr_init(void) static void __exit ip_vs_wrr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); } module_init(ip_vs_wrr_init); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index ee319a4338b..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -17,6 +17,8 @@ * - not all connections have destination server, for example, * connections in backup server when fwmark is used * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback * LOCAL_OUT rules: * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) * - skb->pkt_type is not set yet @@ -49,165 +51,253 @@ enum { IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to * local */ + IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ + IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ + IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ }; +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ + return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ + kfree(dest_dst); +} + /* * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, - u32 dst_cookie) +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, + struct dst_entry *dst, u32 dst_cookie) { - struct dst_entry *old_dst; + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, + lockdep_is_held(&dest->dst_lock)); - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dest->dst_cookie = dst_cookie; - dst_release(old_dst); + if (dest_dst) { + dest_dst->dst_cache = dst; + dest_dst->dst_cookie = dst_cookie; + } + rcu_assign_pointer(dest->dest_dst, dest_dst); + + if (old) + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest) { - struct dst_entry *dst = dest->dst_cache; + struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); + struct dst_entry *dst; - if (!dst) + if (!dest_dst) return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && - dst->ops->check(dst, dest->dst_cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); + dst = dest_dst->dst_cache; + if (dst->obsolete && + dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; + return dest_dst; +} + +static inline bool +__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) +{ + if (IP6CB(skb)->frag_max_size) { + /* frag_max_size tell us that, this packet have been + * defragmented by netfilter IPv6 conntrack module. + */ + if (IP6CB(skb)->frag_max_size > mtu) + return true; /* largest fragment violate MTU */ } - dst_hold(dst); - return dst; + else if (skb->len > mtu && !skb_is_gso(skb)) { + return true; /* Packet size violate MTU size */ + } + return false; +} + +/* Get route to daddr, update *saddr, optionally bind route to saddr */ +static struct rtable *do_output_route4(struct net *net, __be32 daddr, + int rt_mode, __be32 *saddr) +{ + struct flowi4 fl4; + struct rtable *rt; + int loop = 0; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; + +retry: + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + /* Invalid saddr ? */ + if (PTR_ERR(rt) == -EINVAL && *saddr && + rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { + *saddr = 0; + flowi4_update_output(&fl4, 0, 0, daddr, 0); + goto retry; + } + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); + return NULL; + } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + ip_rt_put(rt); + *saddr = fl4.saddr; + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + loop++; + goto retry; + } + *saddr = fl4.saddr; + return rt; } /* Get route to destination or remote server */ -static struct rtable * +static int __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr) + __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ struct rtable *ort; /* Original route */ - int local; + struct iphdr *iph; + __be16 df; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos))) { - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = dest->addr.ip; - fl4.flowi4_tos = rtos; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &dest->addr.ip); - return NULL; + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); + if (!rt) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); - dest->dst_saddr.ip = fl4.saddr; - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " - "rtos=%X\n", - &dest->addr.ip, &dest->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt), rtos); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); } daddr = dest->addr.ip; if (ret_saddr) - *ret_saddr = dest->dst_saddr.ip; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.ip; } else { - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = daddr; - fl4.flowi4_tos = rtos; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) { - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &daddr); - return NULL; - } + __be32 saddr = htonl(INADDR_ANY); + + noref = 0; + + /* For such unconfigured boxes avoid many route lookups + * for performance reasons because we do not remember saddr + */ + rt_mode &= ~IP_VS_RT_MODE_CONNECT; + rt = do_output_route4(net, daddr, rt_mode, &saddr); + if (!rt) + goto err_unreach; if (ret_saddr) - *ret_saddr = fl4.saddr; + *ret_saddr = saddr; } - local = rt->rt_flags & RTCF_LOCAL; + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", (rt->rt_flags & RTCF_LOCAL) ? "local":"non-local", &daddr); - ip_rt_put(rt); - return NULL; - } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " - "requires NAT method, dest: %pI4\n", - &ip_hdr(skb)->daddr, &daddr); - ip_rt_put(rt); - return NULL; + goto err_put; } - if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " - "to non-local address, dest: %pI4\n", - &ip_hdr(skb)->saddr, &daddr); - ip_rt_put(rt); - return NULL; + iph = ip_hdr(skb); + if (likely(!local)) { + if (unlikely(ipv4_is_loopback(iph->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI4 to non-local address, dest: %pI4\n", + &iph->saddr, &daddr); + goto err_put; + } + } else { + ort = skb_rtable(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !(ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to " + "local requires NAT method, dest: %pI4\n", + &iph->daddr, &daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; } - return rt; -} - -/* Reroute packet to local IPv4 stack after DNAT */ -static int -__ip_vs_reroute_locally(struct sk_buff *skb) -{ - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - struct net *net = dev_net(dev); - struct iphdr *iph = ip_hdr(skb); - - if (rt_is_input_route(rt)) { - unsigned long orefdst = skb->_skb_refdst; - - if (ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) - return 0; - refdst_drop(orefdst); + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + df = iph->frag_off & htons(IP_DF); } else { - struct flowi4 fl4 = { - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), - .flowi4_mark = skb->mark, - }; - - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return 0; - if (!(rt->rt_flags & RTCF_LOCAL)) { - ip_rt_put(rt); - return 0; + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; } - /* Drop old route. */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ort = skb_rtable(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + /* MTU check allowed? */ + df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; + } + + /* MTU checking */ + if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + goto err_put; } - return 1; + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #ifdef CONFIG_IP_VS_IPV6 static inline int __ip_vs_is_local_route6(struct rt6_info *rt) { - return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; } static struct dst_entry * @@ -235,7 +325,7 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, goto out_err; } } - ipv6_addr_copy(ret_saddr, &fl6.saddr); + *ret_saddr = fl6.saddr; return dst; out_err: @@ -247,131 +337,196 @@ out_err: /* * Get route to destination or remote server */ -static struct rt6_info * +static int __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, - int do_xfrm, int rt_mode) + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { struct net *net = dev_net(skb_dst(skb)->dev); + struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct rt6_info *ort; /* Original route */ struct dst_entry *dst; - int local; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); - if (!rt) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { u32 cookie; + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, - &dest->dst_saddr.in6, + &dest_dst->dst_saddr.in6, do_xfrm); if (!dst) { - spin_unlock(&dest->dst_lock); - return NULL; + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } rt = (struct rt6_info *) dst; cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest->dst_saddr.in6, + &dest->addr.in6, &dest_dst->dst_saddr.in6, atomic_read(&rt->dst.__refcnt)); } if (ret_saddr) - ipv6_addr_copy(ret_saddr, &dest->dst_saddr.in6); - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.in6; } else { + noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) - return NULL; + goto err_unreach; rt = (struct rt6_info *) dst; } local = __ip_vs_is_local_route6(rt); if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", local ? "local":"non-local", daddr); - dst_release(&rt->dst); - return NULL; + goto err_put; } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = (struct rt6_info *) skb_dst(skb)) && - __ip_vs_is_local_route6(ort))) { - IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " - "requires NAT method, dest: %pI6\n", - &ipv6_hdr(skb)->daddr, daddr); - dst_release(&rt->dst); - return NULL; + if (likely(!local)) { + if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI6c to non-local address, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err_put; + } + } else { + ort = (struct rt6_info *) skb_dst(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !__ip_vs_is_local_route6(ort)) { + IP_VS_DBG_RL("Redirect from non-local address %pI6c " + "to local requires NAT method, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->daddr, daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; } - if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " - "to non-local address, dest: %pI6\n", - &ipv6_hdr(skb)->saddr, daddr); - dst_release(&rt->dst); - return NULL; + + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + ort = (struct rt6_info *) skb_dst(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } - return rt; + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #endif -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) { - struct dst_entry *old_dst; + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset(skb); + skb_forward_csum(skb); + } + return ret; +} - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; } -#define IP_VS_XMIT_TUNNEL(skb, cp) \ -({ \ - int __ret = NF_ACCEPT; \ - \ - (skb)->ipvs_property = 1; \ - if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ - __ret = ip_vs_confirm_conntrack(skb, cp); \ - if (__ret == NF_ACCEPT) { \ - nf_reset(skb); \ - skb_forward_csum(skb); \ - } \ - __ret; \ -}) - -#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - else \ - ip_vs_update_conntrack(skb, cp, 1); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) - -#define IP_VS_XMIT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} /* @@ -379,10 +534,10 @@ do { \ */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { /* we do not touch skb and do not need pskb ptr */ - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } @@ -393,54 +548,31 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - int mtu; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), - IP_VS_RT_MODE_NON_LOCAL, NULL))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, + NULL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -448,58 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - struct ipv6hdr *iph = ipv6_hdr(skb); - int mtu; - EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, - IP_VS_RT_MODE_NON_LOCAL))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu && !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -511,96 +612,71 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - int local; + int local, rc, was_input; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, - "ip_vs_nat_xmit(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error_put; + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) + goto tx_error; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - if (!local) { - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -608,66 +684,64 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; - int local; + int local, rc; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; - p = skb_header_pointer(skb, sizeof(struct ipv6hdr), - sizeof(_pt), &_pt); + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR)))) - goto tx_error_icmp; - local = __ip_vs_is_local_route6(rt); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif @@ -678,43 +752,20 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu && !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) goto tx_error; - ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); - - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? */ - dst_release(&rt->dst); - } + ipv6_hdr(skb)->daddr = cp->daddr.in6; IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); @@ -723,22 +774,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; -tx_error_icmp: - dst_link_failure(skb); tx_error: LeaveFunction(10); kfree_skb(skb); + rcu_read_unlock(); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -764,66 +812,52 @@ tx_error_put: */ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; + __be16 df; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(tos), IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL, - &saddr))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_CONNECT | + IP_VS_RT_MODE_TUNNEL, &saddr); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } + rt = skb_rtable(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - if (mtu < 68) { - IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error_put; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF) && - mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } + /* Copy DF, reset fragment offset and MF */ + df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } - kfree_skb(skb); + + if (!new_skb) + goto tx_error; + consume_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); } @@ -837,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. */ @@ -853,36 +883,33 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ @@ -890,59 +917,38 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *old_iph = ipv6_hdr(skb); struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, - &saddr, 1, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } + rt = (struct rt6_info *) skb_dst(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); - if (mtu < IPV6_MIN_MTU) { - IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, - IPV6_MIN_MTU); - goto tx_error_put; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && - !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->dst); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } - kfree_skb(skb); + + if (!new_skb) + goto tx_error; + consume_skb(skb); skb = new_skb; old_iph = ipv6_hdr(skb); } @@ -953,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. */ @@ -967,32 +969,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); iph->priority = old_iph->priority; memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - ipv6_addr_copy(&iph->daddr, &cp->daddr.in6); - ipv6_addr_copy(&iph->saddr, &saddr); + iph->daddr = cp->daddr.in6; + iph->saddr = saddr; iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip6_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -1003,60 +1002,38 @@ tx_error_put: */ int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL, NULL))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } ip_send_check(ip_hdr(skb)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1064,62 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1132,13 +1083,13 @@ tx_error: */ int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset, unsigned int hooknum) + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *iph) { struct rtable *rt; /* Route to the other host */ - int mtu; int rc; int local; - int rt_mode; + int rt_mode, was_input; EnterFunction(10); @@ -1147,7 +1098,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); + rc = cp->packet_xmit(skb, cp, pp, iph); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -1158,105 +1109,76 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * mangle and send the packet here (only for VS/NAT) */ + was_input = rt_is_input_route(skb_rtable(skb)); /* LOCALNODE from FORWARD hook is not supported */ rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(ip_hdr(skb)->tos), - rt_mode, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp(skb, pp, cp, 0); - if (!local) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); goto out; - tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset, unsigned int hooknum) + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; int rc; int local; int rt_mode; @@ -1268,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); + rc = cp->packet_xmit(skb, cp, pp, ipvsh); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -1284,25 +1206,26 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, rt_mode))) - goto tx_error_icmp; - - local = __ip_vs_is_local_route6(rt); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; + goto tx_error; } } #endif @@ -1313,58 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu && !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp_v6(skb, pp, cp, 0); - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? */ - dst_release(&rt->dst); - } - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); goto out; -tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif |
