Diffstat (limited to 'net/netfilter/ipvs')
29 files changed, 8247 insertions, 4000 deletions
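The ip_vs_conn.c hunks below replace the compile-time IP_VS_CONN_TAB_SIZE/IP_VS_CONN_TAB_MASK constants with a connection table sized at load time from the new conn_tab_bits module parameter (default CONFIG_IP_VS_TAB_BITS, i.e. 12), and switch the bucket lookup to a jhash-based key masked by that size. The following is only a minimal standalone sketch of the size/mask arithmetic and bucket selection, assuming a userspace build; toy_hashkey and its multiplicative hash are illustrative stand-ins, not the kernel's jhash_3words()-based ip_vs_conn_hashkey().

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the conn_tab_bits module parameter introduced by the patch. */
static int conn_tab_bits = 12;

/* Derived at init time, as in the patched ip_vs_conn_init(). */
static int conn_tab_size;
static int conn_tab_mask;

/* Toy hash for illustration only; the kernel hashes addr/port/proto
 * (and the netns pointer) with jhash_3words() and a random seed. */
static unsigned int toy_hashkey(uint32_t addr, uint16_t port, uint8_t proto)
{
	uint32_t h = addr * 2654435761u;

	h ^= ((uint32_t)port << 16) | proto;
	h *= 2246822519u;
	return h & conn_tab_mask;	/* mask picks one of conn_tab_size buckets */
}

int main(void)
{
	conn_tab_size = 1 << conn_tab_bits;
	conn_tab_mask = conn_tab_size - 1;

	printf("table size=%d mask=%#x\n", conn_tab_size, conn_tab_mask);
	printf("bucket=%u\n", toy_hashkey(0xc0a80001, 8080, 6 /* TCP */));
	return 0;
}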
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 79a69805221..0c3b1670b0d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -4,6 +4,7 @@ menuconfig IP_VS tristate "IP virtual server support" depends on NET && INET && NETFILTER + depends on (NF_CONNTRACK || NF_CONNTRACK=n) ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This @@ -26,13 +27,12 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" - depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + depends on IPV6 = y || IP_VS = IPV6 + select IP6_NF_IPTABLES ---help--- - Add IPv6 support to IPVS. This is incomplete and might be dangerous. + Add IPv6 support to IPVS. - See http://www.mindbasket.com/ipvs for more information. - - Say N if unsure. + Say Y if unsure. config IP_VS_DEBUG bool "IP virtual server debugging" @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. + You can overwrite this number setting conn_tab_bits module parameter + or by appending ip_vs.conn_tab_bits=? to the kernel command line + if IP VS was compiled built-in. + comment "IPVS transport protocol load balancing support" config IP_VS_PROTO_TCP @@ -83,23 +87,27 @@ config IP_VS_PROTO_UDP protocol. Say Y if unsure. config IP_VS_PROTO_AH_ESP - bool - depends on UNDEFINED + def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH config IP_VS_PROTO_ESP bool "ESP load balancing support" - select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" - select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. +config IP_VS_PROTO_SCTP + bool "SCTP load balancing support" + select LIBCRC32C + ---help--- + This option enables support for load balancing SCTP transport + protocol. Say Y if unsure. + comment "IPVS scheduler" config IP_VS_RR @@ -112,7 +120,7 @@ config IP_VS_RR module, choose M here. If unsure, say N. config IP_VS_WRR - tristate "weighted round-robin scheduling" + tristate "weighted round-robin scheduling" ---help--- The weighted robin-robin scheduling algorithm directs network connections to different real servers based on server weights @@ -222,11 +230,28 @@ config IP_VS_NQ If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +comment 'IPVS SH scheduler' + +config IP_VS_SH_TAB_BITS + int "IPVS source hashing table size (the Nth power of 2)" + range 4 20 + default 8 + ---help--- + The source hashing scheduler maps source IPs to destinations + stored in a hash table. This table is tiled by each destination + until all slots in the table are filled. When using weights to + allow destinations to receive more connections, the table is + tiled an amount proportional to the weights specified. The table + needs to be large enough to effectively fit all the destinations + multiplied by their respective weights. + comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ + NF_CONNTRACK_FTP + select IP_VS_NFCT ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. 
In the virtual server via Network Address Translation, @@ -238,4 +263,19 @@ config IP_VS_FTP If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_NFCT + bool "Netfilter connection tracking" + depends on NF_CONNTRACK + ---help--- + The Netfilter connection tracking support allows the IPVS + connection state to be exported to the Netfilter framework + for filtering purposes. + +config IP_VS_PE_SIP + tristate "SIP persistence engine" + depends on IP_VS_PROTO_UDP + depends on NF_CONNTRACK_SIP + ---help--- + Allow persistence based on the SIP Call-ID + endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 73a46fe1fe4..34ee602ddb6 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -7,11 +7,15 @@ ip_vs_proto-objs-y := ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o + +ip_vs-extra_objs-y := +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) + ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \ + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) # IPVS core @@ -31,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o # IPVS application helpers obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o + +# IPVS connection template retrievers +obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 201b8ea3020..dfd7b65b3d2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -18,16 +18,19 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/tcp.h> -#include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -39,11 +42,8 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); static DEFINE_MUTEX(__ip_vs_app_mutex); - /* * Get an ip_vs_app object */ @@ -58,12 +58,25 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) module_put(app->module); } +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} /* * Allocate/initialize app incarnation and register it in proto apps. 
*/ static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) { struct ip_vs_protocol *pp; struct ip_vs_app *inc; @@ -94,19 +107,18 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) } } - ret = pp->register_app(inc); + ret = pp->register_app(net, inc); if (ret) goto out; list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); + IP_VS_DBG(9, "%s App %s:%u registered\n", + pp->name, inc->name, ntohs(inc->port)); return 0; out: - kfree(inc->timeout_table); - kfree(inc); + ip_vs_app_inc_destroy(inc); return ret; } @@ -115,7 +127,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) * Release app incarnation */ static void -ip_vs_app_inc_release(struct ip_vs_app *inc) +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; @@ -123,15 +135,14 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) return; if (pp->unregister_app) - pp->unregister_app(inc); + pp->unregister_app(net, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); + pp->name, inc->name, ntohs(inc->port)); list_del(&inc->a_list); - kfree(inc->timeout_table); - kfree(inc); + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); } @@ -143,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) { int result; - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); return result; } @@ -155,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) */ void ip_vs_app_inc_put(struct ip_vs_app *inc) { - ip_vs_app_put(inc->app); atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); } @@ -164,13 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc) * Register an application incarnation in protocol applications */ int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) { int result; mutex_lock(&__ip_vs_app_mutex); - result = ip_vs_app_inc_new(app, proto, port); + result = ip_vs_app_inc_new(net, app, proto, port); mutex_unlock(&__ip_vs_app_mutex); @@ -178,51 +190,79 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) } -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) +/* Register application for netns */ +struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) { - /* increase the module use count */ - ip_vs_use_count_inc(); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_app *a; + int err = 0; + + if (!ipvs) + return ERR_PTR(-ENOENT); mutex_lock(&__ip_vs_app_mutex); - list_add(&app->a_list, &ip_vs_app_list); + list_for_each_entry(a, &ipvs->app_list, a_list) { + if (!strcmp(app->name, a->name)) { + err = -EEXIST; + goto out_unlock; + } + } + a = kmemdup(app, sizeof(*app), GFP_KERNEL); + if (!a) { + err = -ENOMEM; + goto out_unlock; + } + INIT_LIST_HEAD(&a->incs_list); + list_add(&a->a_list, &ipvs->app_list); + /* increase the module use count */ + ip_vs_use_count_inc(); +out_unlock: mutex_unlock(&__ip_vs_app_mutex); - return 0; + return err ? 
ERR_PTR(err) : a; } /* * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() */ -void unregister_ip_vs_app(struct ip_vs_app *app) +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct ip_vs_app *inc, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_app *a, *anxt, *inc, *nxt; + + if (!ipvs) + return; mutex_lock(&__ip_vs_app_mutex); - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); - } + list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { + if (app && strcmp(app->name, a->name)) + continue; + list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { + ip_vs_app_inc_release(net, inc); + } - list_del(&app->a_list); + list_del(&a->a_list); + kfree(a); - mutex_unlock(&__ip_vs_app_mutex); + /* decrease the module use count */ + ip_vs_use_count_dec(); + } - /* decrease the module use count */ - ip_vs_use_count_dec(); + mutex_unlock(&__ip_vs_app_mutex); } /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) { return pp->app_conn_bind(cp); } @@ -262,12 +302,12 @@ static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) if (vseq->delta || vseq->previous_delta) { if(after(seq, vseq->init_seq)) { th->seq = htonl(seq + vseq->delta); - IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", - vseq->delta); + IP_VS_DBG(9, "%s(): added delta (%d) to seq\n", + __func__, vseq->delta); } else { th->seq = htonl(seq + vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " - "(%d) to seq\n", vseq->previous_delta); + IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n", + __func__, vseq->previous_delta); } } } @@ -291,14 +331,14 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) to receive next, so compare it with init_seq+delta */ if(after(ack_seq, vseq->init_seq+vseq->delta)) { th->ack_seq = htonl(ack_seq - vseq->delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " - "(%d) from ack_seq\n", vseq->delta); + IP_VS_DBG(9, "%s(): subtracted delta " + "(%d) from ack_seq\n", __func__, vseq->delta); } else { th->ack_seq = htonl(ack_seq - vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + IP_VS_DBG(9, "%s(): subtracted " "previous_delta (%d) from ack_seq\n", - vseq->previous_delta); + __func__, vseq->previous_delta); } } } @@ -309,17 +349,17 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) * Assumes already checked proto==IPPROTO_TCP and diff!=0. 
*/ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) + unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { vseq->previous_delta = vseq->delta; vseq->delta += diff; vseq->init_seq = seq; cp->flags |= flag; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, @@ -477,11 +517,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) * /proc/net/ip_vs_app entry function */ -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) { struct ip_vs_app *app, *inc; - list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(app, &ipvs->app_list, a_list) { list_for_each_entry(inc, &app->incs_list, a_list) { if (pos-- == 0) return inc; @@ -493,19 +533,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos) static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + mutex_lock(&__ip_vs_app_mutex); - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_app *inc, *app; struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); + return ip_vs_app_idx(ipvs, 0); inc = v; app = inc->app; @@ -514,7 +559,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) return list_entry(e, struct ip_vs_app, a_list); /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { app = list_entry(e, struct ip_vs_app, a_list); list_for_each_entry(inc, &app->incs_list, a_list) { return inc; @@ -553,7 +598,8 @@ static const struct seq_operations ip_vs_app_seq_ops = { static int ip_vs_app_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_app_seq_ops); + return seq_open_net(inode, file, &ip_vs_app_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ip_vs_app_fops = { @@ -561,62 +607,21 @@ static const struct file_operations ip_vs_app_fops = { .open = ip_vs_app_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; #endif - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) +int __net_init ip_vs_app_net_init(struct net *net) { - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, 
diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); + struct netns_ipvs *ipvs = net_ipvs(net); - LeaveFunction(9); + INIT_LIST_HEAD(&ipvs->app_list); + proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } - -int __init ip_vs_app_init(void) -{ - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - - -void ip_vs_app_cleanup(void) +void __net_exit ip_vs_app_net_cleanup(struct net *net) { - proc_net_remove(&init_net, "ip_vs_app"); + unregister_ip_vs_app(net, NULL /* all */); + remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 60aba45023f..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -22,6 +22,9 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/interrupt.h> #include <linux/in.h> #include <linux/net.h> @@ -29,6 +32,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ +#include <linux/slab.h> #include <linux/seq_file.h> #include <linux/jhash.h> #include <linux/random.h> @@ -37,125 +41,148 @@ #include <net/ip_vs.h> +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. Default is what was selected at compile time. +*/ +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; + /* * Connection hash table: for input and output packets lookups of IPVS */ -static struct list_head *ip_vs_conn_tab; +static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - /* counter for no client port connections */ static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); /* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; +static unsigned int ip_vs_conn_rnd __read_mostly; /* * Fine locking granularity for big connection hash table */ -#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_BITS 5 #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) struct ip_vs_aligned_lock { - rwlock_t l; + spinlock_t l; } __attribute__((__aligned__(SMP_CACHE_BYTES))); /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned key) +static inline void ct_write_lock_bh(unsigned int key) { - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_read_unlock(unsigned key) +static inline void ct_write_unlock_bh(unsigned int key) { - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_write_lock(unsigned key) -{ - 
write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} -static inline void ct_write_unlock(unsigned key) +/* + * Returns hash value for IPVS connection entry + */ +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, + const union nf_inet_addr *addr, + __be16 port) { - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), + (__force u32)port, proto, ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; +#endif + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; } -static inline void ct_read_lock_bh(unsigned key) +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) { - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} + const union nf_inet_addr *addr; + __be16 port; -static inline void ct_read_unlock_bh(unsigned key) -{ - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} + if (p->pe_data && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; -static inline void ct_write_lock_bh(unsigned key) -{ - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); } -static inline void ct_write_unlock_bh(unsigned key) +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) { - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); -/* - * Returns hash value for IPVS connection entry - */ -static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, - const union nf_inet_addr *addr, - __be16 port) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), - (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -#endif - return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -} + if (cp->pe) { + p.pe = cp->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + return ip_vs_conn_hashkey_param(&p, false); +} /* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. * returns bool success. 
*/ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return 0; + /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); + spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { - IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); ret = 0; } - ct_write_unlock(hash); + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); return ret; } @@ -163,27 +190,59 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. + * returns bool success. Caller should hold conn reference. */ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); + spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); + hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; atomic_dec(&cp->refcnt); ret = 1; } else ret = 0; - ct_write_unlock(hash); + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret; + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + ret = false; + /* Decrease refcnt and unlink conn only if we are last user */ + if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } else + ret = atomic_read(&cp->refcnt) ? false : true; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); return ret; } @@ -192,149 +251,206 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) /* * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. * Called for pkts coming from OUTside-to-INside. 
- * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) + * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol == cp->protocol) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); return cp; } } - ct_read_unlock(hash); + rcu_read_unlock(); return NULL; } -struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { struct ip_vs_conn *cp; - cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, - d_port); + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), cp ? 
"hit" : "not hit"); return cp; } +static int +ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + int inverse, struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + struct net *net = skb_net(skb); + + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); + if (pptr == NULL) + return 1; + + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); + return 0; +} + +struct ip_vs_conn * +ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + return NULL; + + return ip_vs_conn_in_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + /* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } + continue; + } + + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - goto out; + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (__ip_vs_conn_get(cp)) + goto out; } } cp = NULL; out: - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), cp ? "hit" : "not hit"); return cp; } -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. 
+ * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp, *ret=NULL; /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, d_addr, &cp->caddr) && - ip_vs_addr_equal(af, s_addr, &cp->daddr) && - d_port == cp->cport && s_port == cp->dport && - protocol == cp->protocol) { + hash = ip_vs_conn_hashkey_param(p, true); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport == cp->cport && p->cport == cp->dport && + cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); ret = cp; break; } } - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), ret ? "hit" : "not hit"); return ret; } +struct ip_vs_conn * +ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + return NULL; + + return ip_vs_conn_out_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); /* * Put back the conn and restart its timer with its timeout */ void ip_vs_conn_put(struct ip_vs_conn *cp) { - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); + unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? + 0 : cp->timeout; + mod_timer(&cp->timer, jiffies+t); __ip_vs_conn_put(cp); } @@ -346,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->flags & IP_VS_CONN_F_NO_CPORT) { atomic_dec(&ip_vs_conn_no_cport_cnt); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* hash on new dport */ ip_vs_conn_hash(cp); @@ -430,23 +546,32 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) static inline void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) { + unsigned int conn_flags; + __u32 flags; + /* if dest is NULL, then return directly */ if (!dest) return; /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); + conn_flags = atomic_read(&dest->conn_flags); + if (cp->protocol != IPPROTO_UDP) + conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + if (flags & IP_VS_CONN_F_SYNC) { /* if the connection is not template and is created * by sync, preserve the activity flag. 
*/ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); + } + flags |= conn_flags; + cp->flags = flags; cp->dest = dest; IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " @@ -461,18 +586,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_read(&dest->refcnt)); /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) atomic_inc(&dest->activeconns); else atomic_inc(&dest->inactconns); } else { /* It is a persistent connection/template, so increase - the peristent connection counter */ + the persistent connection counter */ atomic_inc(&dest->persistconns); } @@ -486,18 +611,46 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) * Check if there is a destination for the connection, if so * bind the connection to the destination. */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, - &cp->vaddr, cp->vport, - cp->protocol); + rcu_read_lock(); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock_bh(&cp->lock); + if (cp->dest) { + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; + spin_unlock_bh(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + rcu_read_unlock(); } @@ -534,7 +687,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) } } else { /* It is a persistent connection/template, so decrease - the peristent connection counter */ + the persistent connection counter */ atomic_dec(&dest->persistconns); } @@ -549,14 +702,19 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; } - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. 
- */ - atomic_dec(&dest->refcnt); + ip_vs_dest_put(dest); } +static int expire_quiescent_template(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL + return ipvs->sysctl_expire_quiescent_template && + (atomic_read(&dest->weight) == 0); +#else + return 0; +#endif +} /* * Checking if the destination of a connection template is available. @@ -566,14 +724,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) int ip_vs_check_template(struct ip_vs_conn *ct) { struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); /* * Checking the dest server status. */ if ((dest == NULL) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { + expire_quiescent_template(ipvs, dest)) { IP_VS_DBG_BUF(9, "check_template: dest not available for " "protocol %s s:%s:%d v:%s:%d " "-> d:%s:%d\n", @@ -601,22 +759,27 @@ int ip_vs_check_template(struct ip_vs_conn *ct) * Simply decrease the refcnt of the template, * don't restart its timer. */ - atomic_dec(&ct->refcnt); + __ip_vs_conn_put(ct); return 0; } return 1; } -static void ip_vs_conn_expire(unsigned long data) +static void ip_vs_conn_rcu_free(struct rcu_head *head) { - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); - cp->timeout = 60*HZ; + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct net *net = ip_vs_conn_net(cp); + struct netns_ipvs *ipvs = net_ipvs(net); /* * do I control anybody? @@ -624,51 +787,60 @@ static void ip_vs_conn_expire(unsigned long data) if (atomic_read(&cp->n_control)) goto expire_later; - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); + del_timer(&cp->timer); /* does anybody control me? */ if (cp->control) ip_vs_control_del(cp); + if (cp->flags & IP_VS_CONN_F_NFCT) { + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } + if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + atomic_dec(&ipvs->conn_count); return; } - /* hash it back to the table */ - ip_vs_conn_hash(cp); - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt), atomic_read(&cp->n_control)); + atomic_inc(&cp->refcnt); + cp->timeout = 60*HZ; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_conn_put(cp); } - +/* Modify timer, so that it expires as soon as possible. 
+ * Can be called without reference only if under RCU lock. + */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. + */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); } @@ -676,31 +848,46 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) +ip_vs_conn_new(const struct ip_vs_conn_param *p, + const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, + struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + struct netns_ipvs *ipvs = net_ipvs(p->net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + p->protocol); - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { - IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + IP_VS_ERR_RL("%s(): no memory\n", __func__); return NULL; } - INIT_LIST_HEAD(&cp->c_list); + INIT_HLIST_NODE(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->af = af; - cp->protocol = proto; - ip_vs_addr_copy(af, &cp->caddr, caddr); - cp->cport = cport; - ip_vs_addr_copy(af, &cp->vaddr, vaddr); - cp->vport = vport; - ip_vs_addr_copy(af, &cp->daddr, daddr); + ip_vs_conn_net_set(cp, p->net); + cp->af = p->af; + cp->protocol = p->protocol; + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ + ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->vaddr, p->vaddr); + cp->vport = p->vport; + ip_vs_addr_set(p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; + cp->fwmark = fwmark; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; + } spin_lock_init(&cp->lock); /* @@ -710,30 +897,51 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, */ atomic_set(&cp->refcnt, 1); + cp->control = NULL; atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); - atomic_inc(&ip_vs_conn_count); + cp->packet_xmit = NULL; + cp->app = NULL; + cp->app_data = NULL; + /* reset struct ip_vs_seq */ + cp->in_seq.delta = 0; + cp->out_seq.delta = 0; + + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); /* Bind the connection with a destination server */ + cp->dest = NULL; ip_vs_bind_dest(cp, dest); /* Set its state and timeout */ cp->state = 0; + cp->old_state = 0; cp->timeout = 3*HZ; + cp->sync_endtime = jiffies & ~3UL; /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) + if (p->af == AF_INET6) ip_vs_bind_xmit_v6(cp); else #endif ip_vs_bind_xmit(cp); - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); + if (unlikely(pd && atomic_read(&pd->appcnt))) + ip_vs_bind_app(cp, pd->pp); + + /* + * Allow conntrack to be preserved. 
By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled(ipvs)) + cp->flags |= IP_VS_CONN_F_NFCT; /* Hash it in the ip_vs_conn_tab finally */ ip_vs_conn_hash(cp); @@ -741,41 +949,53 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, return cp; } - /* * /proc/net/ip_vs_conn entries */ #ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { + struct seq_net_private p; + struct hlist_head *l; +}; static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) { int idx; struct ip_vs_conn *cp; + struct ip_vs_iter_state *iter = seq->private; - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + /* __ip_vs_conn_get() is not needed by + * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show + */ if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; + iter->l = &ip_vs_conn_tab[idx]; return cp; } } - ct_read_unlock_bh(idx); + cond_resched_rcu(); } return NULL; } static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { - seq->private = NULL; + struct ip_vs_iter_state *iter = seq->private; + + iter->l = NULL; + rcu_read_lock(); return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; + struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *e; + struct hlist_head *l = iter->l; int idx; ++*pos; @@ -783,30 +1003,26 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ip_vs_conn_array(seq, 0); /* more on same hash chain? 
*/ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) + return hlist_entry(e, struct ip_vs_conn, c_list); idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - - while (++idx < IP_VS_CONN_TAB_SIZE) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; + while (++idx < ip_vs_conn_tab_size) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + iter->l = &ip_vs_conn_tab[idx]; return cp; } - ct_read_unlock_bh(idx); + cond_resched_rcu(); } - seq->private = NULL; + iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct list_head *l = seq->private; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); + rcu_read_unlock(); } static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) @@ -814,30 +1030,46 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + if (cp->pe_data) { + pe_data[0] = ' '; + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->pe->show_pe_data(cp, pe_data + len); + } + pe_data[len] = '\0'; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n", + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%pI6 %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), &cp->daddr.in6, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); + (cp->timer.expires-jiffies)/HZ, pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu\n", + " %08X %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); + (cp->timer.expires-jiffies)/HZ, pe_data); } return 0; } @@ -851,7 +1083,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = { static int ip_vs_conn_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_conn_seq_ops); + return seq_open_net(inode, file, &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state)); } static const struct file_operations ip_vs_conn_fops = { @@ -859,10 +1092,10 @@ static const struct file_operations ip_vs_conn_fops = { .open = ip_vs_conn_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; -static const char *ip_vs_origin_name(unsigned flags) +static const char *ip_vs_origin_name(unsigned int flags) { if (flags & IP_VS_CONN_F_SYNC) return "SYNC"; @@ -878,6 +1111,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -914,7 +1151,8 @@ static const struct 
seq_operations ip_vs_conn_sync_seq_ops = { static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_conn_sync_seq_ops); + return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state)); } static const struct file_operations ip_vs_conn_sync_fops = { @@ -922,7 +1160,7 @@ static const struct file_operations ip_vs_conn_sync_fops = { .open = ip_vs_conn_sync_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; #endif @@ -960,27 +1198,24 @@ static inline int todrop_entry(struct ip_vs_conn *cp) } /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) +void ip_vs_random_dropentry(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; + rcu_read_lock(); /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; - + if (!ip_vs_conn_net_eq(cp, net)) + continue; if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -995,6 +1230,18 @@ void ip_vs_random_dropentry(void) default: continue; } + } else if (cp->protocol == IPPROTO_SCTP) { + switch (cp->state) { + case IP_VS_SCTP_S_INIT1: + case IP_VS_SCTP_S_INIT: + break; + case IP_VS_SCTP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + default: + continue; + } } else { if (!todrop_entry(cp)) continue; @@ -1002,60 +1249,91 @@ void ip_vs_random_dropentry(void) IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(hash); + cond_resched_rcu(); } + rcu_read_unlock(); } /* * Flush all the connection entries in the ip_vs_conn_tab */ -static void ip_vs_conn_flush(void) +static void ip_vs_conn_flush(struct net *net) { int idx; - struct ip_vs_conn *cp; - - flush_again: - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { - /* - * Lock is actually needed in this loop. 
- */ - ct_write_lock_bh(idx); + struct ip_vs_conn *cp, *cp_c; + struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { +flush_again: + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (!ip_vs_conn_net_eq(cp, net)) + continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(idx); + cond_resched_rcu(); } + rcu_read_unlock(); /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { + if (atomic_read(&ipvs->conn_count) != 0) { schedule(); goto flush_again; } } +/* + * per netns init and exit + */ +int __net_init ip_vs_conn_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + atomic_set(&ipvs->conn_count, 0); + + proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); + proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); + return 0; +} + +void __net_exit ip_vs_conn_net_cleanup(struct net *net) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(net); + remove_proc_entry("ip_vs_conn", net->proc_net); + remove_proc_entry("ip_vs_conn_sync", net->proc_net); +} int __init ip_vs_conn_init(void) { int idx; + /* Compute size and mask */ + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1068,39 +1346,31 @@ int __init ip_vs_conn_init(void) return -ENOMEM; } - IP_VS_INFO("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + pr_info("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) + INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); } - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); - /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); return 0; } - void ip_vs_conn_cleanup(void) { - /* flush all the connection entries first */ - ip_vs_conn_flush(); - + /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ + rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); vfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c 
b/net/netfilter/ipvs/ip_vs_core.c index cb3e031335e..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -24,17 +24,24 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> +#include <linux/sctp.h> #include <linux/icmp.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ #include <net/route.h> +#include <net/ip6_checksum.h> +#include <net/netns/generic.h> /* net_generic() */ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -42,6 +49,7 @@ #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <linux/netfilter_ipv6.h> +#include <net/ip6_route.h> #endif #include <net/ip_vs.h> @@ -49,7 +57,6 @@ EXPORT_SYMBOL(register_ip_vs_scheduler); EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); EXPORT_SYMBOL(ip_vs_proto_name); EXPORT_SYMBOL(ip_vs_conn_new); EXPORT_SYMBOL(ip_vs_conn_in_get); @@ -62,12 +69,15 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +static int ip_vs_net_id __read_mostly; +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) -const char *ip_vs_proto_name(unsigned proto) +const char *ip_vs_proto_name(unsigned int proto) { static char buf[20]; @@ -78,6 +88,8 @@ const char *ip_vs_proto_name(unsigned proto) return "UDP"; case IPPROTO_TCP: return "TCP"; + case IPPROTO_SCTP: + return "SCTP"; case IPPROTO_ICMP: return "ICMP"; #ifdef CONFIG_IP_VS_IPV6 @@ -85,7 +97,7 @@ const char *ip_vs_proto_name(unsigned proto) return "ICMPv6"; #endif default: - sprintf(buf, "IP_%d", proto); + sprintf(buf, "IP_%u", proto); return buf; } } @@ -100,21 +112,32 @@ static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.inpkts++; - dest->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.inpkts++; - dest->svc->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.inpkts++; - ip_vs_stats.ustats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); + struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + rcu_read_unlock(); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); } } @@ -123,21 +146,32 @@ static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - 
spin_lock(&dest->stats.lock); - dest->stats.ustats.outpkts++; - dest->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.outpkts++; - dest->svc->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.outpkts++; - ip_vs_stats.ustats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); + struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + rcu_read_unlock(); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); } } @@ -145,30 +179,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.ustats.conns++; - spin_unlock(&cp->dest->stats.lock); + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(cp->dest->stats.cpustats); + s->ustats.conns++; - spin_lock(&svc->stats.lock); - svc->stats.ustats.conns++; - spin_unlock(&svc->stats.lock); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.conns++; - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.conns++; - spin_unlock(&ip_vs_stats.lock); + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.conns++; } -static inline int +static inline void ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { - if (unlikely(!pp->state_transition)) - return 0; - return pp->state_transition(cp, direction, skb, pp); + if (likely(pd->pp->state_transition)) + pd->pp->state_transition(cp, direction, skb, pd); } +static inline int +ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, + struct sk_buff *skb, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + vport, p); + p->pe = rcu_dereference(svc->pe); + if (p->pe && p->pe->fill_param) + return p->pe->fill_param(p, skb); + + return 0; +} /* * IPVS persistent scheduling function @@ -179,31 +227,32 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, - __be16 ports[2]) + struct sk_buff *skb, __be16 src_port, __be16 dst_port, + int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ + __be16 dport = 0; /* destination port to forward */ + unsigned int flags; + struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - /* Mask saddr with the netmask to adjust template 
granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + (__force __u32) svc->netmask); else #endif - snet.ip = iph.saddr.ip & svc->netmask; + snet.ip = iph->saddr.ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -219,131 +268,99 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> * is created for other persistent services. */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, ports[1]); - else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like <protocol,caddr,0, - * vaddr,vport,daddr,dport> for non-ftp service, - * and <protocol,caddr,0,vaddr,0,daddr,0> - * for ftp service. + { + int protocol = iph->protocol; + const union nf_inet_addr *vaddr = &iph->daddr; + __be16 vport = 0; + + if (dst_port == svc->port) { + /* non-FTP template: + * <protocol, caddr, 0, vaddr, vport, daddr, dport> + * FTP template: + * <protocol, caddr, 0, vaddr, 0, daddr, 0> */ if (svc->port != FTPPORT) - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, - ports[1], - &dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; + vport = dst_port; } else { - /* set destination with the found template */ - dest = ct->dest; + /* Note: persistent fwmark-based services and + * persistent port zero service are handled here. + * fwmark template: + * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> + * port zero template: + * <protocol,caddr,0,vaddr,0,daddr,0> + */ + if (svc->fwmark) { + protocol = IPPROTO_IP; + vaddr = &fwmark; + } } - dport = dest->port; - } else { + /* return *ignored = -1 so NF_DROP can be used */ + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m) < 0) { + *ignored = -1; + return NULL; + } + } + + /* Check if a template already exists */ + ct = ip_vs_ct_in_get(¶m); + if (!ct || !ip_vs_check_template(ct)) { + struct ip_vs_scheduler *sched; + /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> - * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> + * No template found or the dest of the connection + * template is not available. + * return *ignored=0 i.e. 
ICMP and NF_DROP */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, - &fwmark, 0); - } else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); + if (!dest) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + kfree(param.pe_data); + *ignored = 0; + return NULL; + } - /* - * Create a template according to the service - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_conn_new(svc->af, IPPROTO_IP, - &snet, 0, - &fwmark, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - } else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; + if (dst_port == svc->port && svc->port != FTPPORT) + dport = dest->port; + + /* Create a template + * This adds param.pe_data to the template, + * and thus param.pe_data will be destroyed + * when the template expires */ + ct = ip_vs_conn_new(¶m, &dest->addr, dport, + IP_VS_CONN_F_TEMPLATE, dest, skb->mark); + if (ct == NULL) { + kfree(param.pe_data); + *ignored = -1; + return NULL; } - dport = ports[1]; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + kfree(param.pe_data); } + dport = dst_port; + if (dport == svc->port && dest->port) + dport = dest->port; + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; + /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, ports[0], - &iph.daddr, ports[1], - &dest->addr, dport, - 0, - dest); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, + src_port, &iph->daddr, dst_port, ¶m); + + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); + *ignored = -1; return NULL; } @@ -363,54 +380,113 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP + * + * Usage of *ignored + * + * 1 : protocol tried to schedule (eg. on SYN), found svc but the + * svc/scheduler decides that this packet should be accepted with + * NF_ACCEPT because it must not be scheduled. + * + * 0 : scheduler can not find destination, so try bypass or + * return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 : scheduler tried to schedule but fatal error occurred, eg. + * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + * failure such as missing Call-ID, ENOMEM on skb_linearize + * or pe_data. In this case we should return NF_DROP without + * any attempts to send ICMP with ip_vs_leave. 
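The three *ignored outcomes documented in the comment above are what the per-protocol conn_schedule callers translate into netfilter verdicts. A minimal sketch of that mapping, not the actual caller, using a hypothetical helper name and the same svc/skb/pd/iph arguments seen throughout this file:

/* Sketch only: turning ip_vs_schedule()'s *ignored into a verdict.
 * -1: fatal error (e.g. ip_vs_conn_new() failed), drop silently;
 *  0: no destination, let ip_vs_leave() try bypass or send ICMP;
 *  1: the service decided this packet must not be scheduled.
 */
static int sample_no_cp_verdict(struct ip_vs_service *svc,
				struct sk_buff *skb,
				struct ip_vs_proto_data *pd,
				struct ip_vs_iphdr *iph, int ignored)
{
	if (ignored < 0)
		return NF_DROP;
	if (ignored == 0)
		return ip_vs_leave(svc, skb, pd, iph);
	return NF_ACCEPT;
}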
*/ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *ignored, + struct ip_vs_iphdr *iph) { + struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; + struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; + unsigned int flags; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + *ignored = 1; + /* + * IPv6 frags, only the first hit here. + */ + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return NULL; /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + + /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr); + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + iph); + + *ignored = 0; /* * Non-persistent service */ if (!svc->fwmark && pptr[1] != svc->port) { if (!svc->port) - IP_VS_ERR("Schedule: port zero only supported " - "in persistent services, " - "check your ipvs configuration\n"); + pr_err("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); return NULL; } - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; + /* * Create a connection entry. */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &dest->addr, dest->port ? dest->port : pptr[1], - 0, - dest); - if (cp == NULL) - return NULL; + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], &iph->daddr, + pptr[1], &p); + cp = ip_vs_conn_new(&p, &dest->addr, + dest->port ? dest->port : pptr[1], + flags, dest, skb->mark); + if (!cp) { + *ignored = -1; + return NULL; + } + } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", @@ -431,61 +507,71 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * no destination is available for a new connection. 
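Stepping back to the statistics conversion near the top of this file: the global spinlock-protected counters become per-CPU counters updated under u64_stats_update_begin()/u64_stats_update_end(). A reader then has to sum the per-CPU values and retry any CPU whose writer was mid-update. A minimal sketch of such a reader, assuming <linux/u64_stats_sync.h> and the ip_vs_cpu_stats layout used above; the function name and output parameters are illustrative only:

/* Sketch only: consistent read of per-CPU inbytes/inpkts counters. */
static void sample_sum_in_stats(struct ip_vs_cpu_stats __percpu *cpustats,
				u64 *inbytes, u64 *inpkts)
{
	int cpu;

	*inbytes = 0;
	*inpkts = 0;
	for_each_possible_cpu(cpu) {
		struct ip_vs_cpu_stats *s = per_cpu_ptr(cpustats, cpu);
		unsigned int start;
		u64 bytes;

		do {
			start = u64_stats_fetch_begin(&s->syncp);
			bytes = s->ustats.inbytes;
		} while (u64_stats_fetch_retry(&s->syncp, start));

		*inbytes += bytes;
		/* inpkts is not covered by syncp in the hunks above */
		*inpkts += s->ustats.inpkts;
	}
}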
*/ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { __be16 _ports[2], *pptr; - struct ip_vs_iphdr iph; +#ifdef CONFIG_SYSCTL + struct net *net; + struct netns_ipvs *ipvs; int unicast; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); +#endif - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) { - ip_vs_service_put(svc); return NF_DROP; } +#ifdef CONFIG_SYSCTL + net = skb_net(skb); + #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; else #endif - unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { - int ret, cs; + ipvs = net_ipvs(net); + if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { + int ret; struct ip_vs_conn *cp; + unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && + iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - ip_vs_service_put(svc); - /* create a new connection entry */ - IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &daddr, 0, - IP_VS_CONN_F_BYPASS, - NULL); - if (cp == NULL) - return NF_DROP; + IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, &daddr, 0, + IP_VS_CONN_F_BYPASS | flags, + NULL, skb->mark); + if (!cp) + return NF_DROP; + } /* statistics */ ip_vs_in_stats(cp, skb); /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); + ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } +#endif /* * When the virtual ftp service is presented, packets destined @@ -493,12 +579,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) return NF_ACCEPT; - } - - ip_vs_service_put(svc); /* * Notify the client that the destination is unreachable, and @@ -508,58 +590,89 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. 
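ip_vs_leave() above reads the per-netns cache_bypass setting straight from netns_ipvs; the next hunk wraps the other sysctl reads in small accessors that compile to a constant 0 when CONFIG_SYSCTL is disabled, keeping #ifdefs out of the fast path. A minimal sketch of that pattern with a made-up knob name (the real accessors cover snat_reroute, nat_icmp_send and expire_nodest_conn):

#ifdef CONFIG_SYSCTL
static int sysctl_example_knob(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_example_knob;	/* hypothetical field */
}
#else
static int sysctl_example_knob(struct netns_ipvs *ipvs) { return 0; }
#endif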
--WZ */ #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, - skb->dev); - else + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net_ = dev_net(skb_dst(skb)->dev); + + skb->dev = net_->loopback_dev; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } +#ifdef CONFIG_SYSCTL -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static int sysctl_snat_reroute(struct sk_buff *skb) { - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + return ipvs->sysctl_snat_reroute; } +static int sysctl_nat_icmp_send(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + return ipvs->sysctl_nat_icmp_send; +} + +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_expire_nodest_conn; +} + +#else + +static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } +static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } + +#endif + __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { - int err = ip_defrag(skb, user); + int err; + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); return err; } -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +static int ip_vs_route_me_harder(int af, struct sk_buff *skb) { - /* TODO IPv6: Find out what to do here for IPv6 */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) + return 1; + } else +#endif + if ((sysctl_snat_reroute(skb) || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + return 1; + return 0; } -#endif /* * Packet has been made sufficiently writable in caller @@ -586,8 +699,9 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, ip_send_check(ciph); } - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || + IPPROTO_SCTP == ciph->protocol) { __be16 *ports = (void *)ciph + ciph->ihl*4; if (inout) @@ -602,10 +716,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else - 
IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } @@ -614,10 +728,19 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct ipv6hdr *iph = ipv6_hdr(skb); - unsigned int icmp_offset = sizeof(struct ipv6hdr); - struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + - icmp_offset); - struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + unsigned int icmp_offset = 0; + unsigned int offs = 0; /* header offset*/ + int protocol; + struct icmp6hdr *icmph; + struct ipv6hdr *ciph; + unsigned short fragoffs; + + ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); + icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); + offs = icmp_offset + sizeof(struct icmp6hdr); + ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); + + protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); if (inout) { iph->saddr = cp->vaddr.in6; @@ -627,10 +750,14 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, ciph->saddr = cp->daddr.in6; } - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { - __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + /* the TCP/UDP/SCTP port */ + if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol)) { + __be16 *ports = (void *)(skb_network_header(skb) + offs); + IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, + ntohs(inout ? ports[1] : ports[0]), + ntohs(inout ? cp->vport : cp->dport)); if (inout) ports[1] = cp->vport; else @@ -638,22 +765,26 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); } #endif /* Handle relevant response ICMP messages - forward to the right - * destination host. Used for NAT and local client. + * destination host. 
*/ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, @@ -664,8 +795,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); + pr_err("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); } /* Ensure the checksum is correct */ @@ -676,7 +807,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, goto out; } - if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); if (!skb_make_writable(skb, offset)) goto out; @@ -688,10 +820,17 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); + if (ip_vs_route_me_harder(af, skb)) + goto out; + /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: @@ -705,7 +844,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -719,8 +859,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) *related = 1; /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -763,52 +903,38 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - - offset += cih->ihl * 4; + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); - ip_vs_fill_iphdr(AF_INET, cih, &ciph); + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); if (!cp) return NF_ACCEPT; snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, offset, ihl); + pp, ciph.len, ihl); } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum, struct ip_vs_iphdr *ipvsh) { - struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; + struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset; union nf_inet_addr snet; + unsigned int writable; *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, 
sizeof(_icmph), &_icmph); + ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); if (ic == NULL) return NF_DROP; - IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - &iph->saddr, &iph->daddr); - /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy @@ -816,44 +942,63 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (ipvsh->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &ipvsh->saddr, &ipvsh->daddr); /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) + ciph.len = ipvsh->len + sizeof(_icmph); + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); + ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + + pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; - /* Is the embedded protocol header present? 
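Both ICMPv6 paths now call ipv6_find_hdr() to walk past extension headers of the embedded packet instead of assuming the transport header follows the IPv6 header directly. A minimal sketch of that call in isolation, with a hypothetical wrapper; only the arguments already used in this diff are relied on:

/* Sketch only: find the upper-layer protocol of an IPv6 packet.
 * Returns the protocol number and leaves its offset in *offset;
 * *fragoffs becomes non-zero for non-first fragments; a negative
 * return means the extension header chain looked wrong. */
static int sample_find_l4(struct sk_buff *skb, unsigned int *offset,
			  unsigned short *fragoffs)
{
	*offset = 0;	/* 0: start from the packet's own IPv6 header */
	return ipv6_find_hdr(skb, offset, -1, fragoffs, NULL);
}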
*/ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); if (!cp) return NF_ACCEPT; - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, - pp, offset, sizeof(struct ipv6hdr)); + snet.in6 = ciph.saddr.in6; + writable = ciph.len; + return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, + pp, writable, sizeof(struct ipv6hdr)); } #endif +/* + * Check if sctp chunc is ABORT chunk + */ +static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) +{ + sctp_chunkhdr_t *sch, schunk; + sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return 0; + if (sch->type == SCTP_CID_ABORT) + return 1; + return 0; +} + static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) { struct tcphdr _tcph, *th; @@ -864,20 +1009,47 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +static inline bool is_new_conn(const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + return th->syn; + } + case IPPROTO_SCTP: { + sctp_chunkhdr_t *sch, schunk; + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return false; + return sch->type == SCTP_CID_INIT; + } + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... - * Used for NAT and local client. */ static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int ihl) +handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + struct ip_vs_protocol *pp = pd->pp; - if (!skb_make_writable(skb, ihl)) + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, iph->len)) goto drop; /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph)) goto drop; #ifdef CONFIG_IP_VS_IPV6 @@ -890,28 +1062,34 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_send_check(ip_hdr(skb)); } + /* + * nf_iterate does not expect change in the skb->dst->dev. + * It looks like it is not fatal to enable this code for hooks + * where our handlers are at the end of the chain list and + * when all next handlers use skb->dst->dev and not outdev. + * It will definitely route properly the inout NAT traffic + * when multiple paths are used. + */ + /* For policy routing, packets originating from this * machine itself may be routed differently to packets * passing through. We want this packet to be routed as * if it came from this machine itself. So re-compute * the routing information. 
*/ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; + if (ip_vs_route_me_harder(af, skb)) + goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + ip_vs_conn_put(cp); LeaveFunction(11); return NF_ACCEPT; @@ -919,123 +1097,189 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, drop: ip_vs_conn_put(cp); kfree_skb(skb); + LeaveFunction(11); return NF_STOLEN; } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { + struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; - int af; EnterFunction(11); - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - + /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + ip_vs_fill_iph_skb(af, skb, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum, &iph); if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) return NF_ACCEPT; + pp = pd->pp; /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else + if (af == AF_INET) #endif - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) return NF_STOLEN; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); } /* * Check if the packet belongs to an 
existing entry */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb, iph.len)) { + cp = pp->conn_out_get(af, skb, &iph, 0); + + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph); + if (sysctl_nat_icmp_send(net) && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = frag_safe_skb_hp(skb, iph.len, + sizeof(_ports), _ports, &iph); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0, skb->dev); - else + if (af == AF_INET6) { + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else #endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; } } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; } + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; +} - return handle_response(af, skb, pp, cp, iph.len); +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET); } +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET); +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. 
+ */ +static unsigned int +ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET6); +} + +#endif /* * Handle ICMP messages in the outside-to-inside direction (incoming). @@ -1046,21 +1290,22 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, static int ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) { + struct net *net = NULL; struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - union nf_inet_addr snet; + struct ip_vs_proto_data *pd; + unsigned int offset, offset2, ihl, verdict; + bool ipip; *related = 1; /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1094,33 +1339,46 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(cih->protocol); - if (!pp) + net = skb_net(skb); + + /* Special case for errors for IPIP packets */ + ipip = false; + if (cih->protocol == IPPROTO_IPIP) { + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + /* Error for our IPIP must arrive at LOCAL_IN */ + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) + return NF_ACCEPT; + offset += cih->ihl * 4; + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + ipip = true; + } + + pd = ip_vs_proto_data_get(net, cih->protocol); + if (!pd) return NF_ACCEPT; + pp = pd->pp; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - - offset += cih->ihl * 4; + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (cp) { - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, - cih->protocol, cp, pp, - offset, ihl); - } + offset2 = offset; + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; + offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. + * For IPIP this is error for request, not for reply. + */ + cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 
0 : 1); + if (!cp) return NF_ACCEPT; - } verdict = NF_DROP; @@ -1132,53 +1390,95 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) goto out; } + if (ipip) { + __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; + + /* Update the MTU */ + if (ic->type == ICMP_DEST_UNREACH && + ic->code == ICMP_FRAG_NEEDED) { + struct ip_vs_dest *dest = cp->dest; + u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; + + /* Strip outer IP and ICMP, go to IPIP header */ + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; + offset2 -= ihl + sizeof(_icmph); + skb_reset_network_header(skb); + IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); + ipv4_update_pmtu(skb, dev_net(skb->dev), + mtu, 0, 0, 0, 0); + /* Client uses PMTUD? */ + if (!(frag_off & htons(IP_DF))) + goto ignore_ipip; + /* Prefer the resulting PMTU */ + if (dest) { + struct ip_vs_dest_dst *dest_dst; + + rcu_read_lock(); + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + rcu_read_unlock(); + } + if (mtu > 68 + sizeof(struct iphdr)) + mtu -= sizeof(struct iphdr); + info = htonl(mtu); + } + /* Strip outer IP, ICMP and IPIP, go to IP header of + * original request. + */ + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; + skb_reset_network_header(skb); + IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + type, code, ntohl(info)); + icmp_send(skb, type, code, info); + /* ICMP can be shorter but anyways, account it */ + ip_vs_out_stats(cp, skb); + +ignore_ipip: + consume_skb(skb); + verdict = NF_STOLEN; + goto out; + } + /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || + IPPROTO_SCTP == cih->protocol) offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); - out: +out: __ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum, struct ip_vs_iphdr *iph) { - struct ipv6hdr *iph; + struct net *net = NULL; + struct ipv6hdr _ip6h, *ip6h; struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset, verdict; - union nf_inet_addr snet; + struct ip_vs_proto_data *pd; + unsigned int offs_ciph, writable, verdict; *related = 1; - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph); if (ic == NULL) return NF_DROP; - IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - &iph->saddr, &iph->daddr); - /* * Work through seeing if this is for us. 
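The IPIP branch added above re-sends tunnel ICMP errors to the original client and, for ICMP_FRAG_NEEDED, shrinks the advertised MTU by the outer IPv4 header the client never sees: a 1500-byte path MTU reported for the encapsulated packet becomes roughly 1480 for the client. A minimal sketch of just that adjustment:

/* Sketch only: MTU to advertise to the client when the error was
 * triggered by our IPIP-encapsulated packet. */
static u32 sample_client_mtu(u32 tunnel_path_mtu)
{
	/* keep at least the 68-byte IPv4 minimum plus one outer header */
	if (tunnel_path_mtu > 68 + sizeof(struct iphdr))
		tunnel_path_mtu -= sizeof(struct iphdr);
	return tunnel_path_mtu;
}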
* These checks are supposed to be in an order that means easy @@ -1186,56 +1486,71 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (iph->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) + ciph.len = iph->len + sizeof(_icmph); + offs_ciph = ciph.len; /* Save ip header offset */ + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) + ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, ciph.protocol); + if (!pd) return NF_ACCEPT; + pp = pd->pp; - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + /* Cannot handle fragmented embedded protocol */ + if (ciph.fragoffs) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, + "Checking incoming ICMPv6 for"); - offset += sizeof(struct ipv6hdr); + /* The embedded headers contain source and dest in reverse order + * if not from localhost + */ + cp = pp->conn_in_get(AF_INET6, skb, &ciph, + (hooknum == NF_INET_LOCAL_OUT) ? 
0 : 1); - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (cp) { - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, - cih->nexthdr, - cp, pp, offset, - sizeof(struct ipv6hdr)); - } + if (!cp) + return NF_ACCEPT; + /* VS/TUN, VS/DR and LOCALNODE just let it go */ + if ((hooknum == NF_INET_LOCAL_OUT) && + (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { + __ip_vs_conn_put(cp); return NF_ACCEPT; } - verdict = NF_DROP; - /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ + + /* Need to mangle contained IPv6 header in ICMPv6 packet */ + writable = ciph.len; + if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || + IPPROTO_SCTP == ciph.protocol) + writable += 2 * sizeof(__u16); /* Also mangle ports */ + + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); __ip_vs_conn_put(cp); @@ -1249,75 +1564,122 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * and send it on its way... */ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { + struct net *net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; - int ret, restart, af; - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + int ret, pkts; + struct netns_ipvs *ipvs; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iph_skb(af, skb, &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + ip_vs_fill_iph_skb(af, skb, &iph); - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + /* Bad... 
Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; } +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, + &iph); + + if (related) + return verdict; + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + } + /* Protocol supported? */ - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) return NF_ACCEPT; - + pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, &iph, 0); - if (unlikely(!cp)) { - int v; + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && + unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && + is_new_conn(skb, &iph)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); + if (unlikely(!cp) && !iph.fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ + int v; - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) + /* Schedule and create new connection entry into &cp */ + if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) return v; } if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_in: packet continues traversal as normal"); + if (iph.fragoffs) { + /* Fragment that couldn't be mapped to a conn entry + * is missing module nf_defrag_ipv6 + */ + IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); + } return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); - + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ - if (sysctl_ip_vs_expire_nodest_conn) { + if (sysctl_expire_nodest_conn(ipvs)) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); } @@ -1328,9 +1690,9 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, } ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); + ret = cp->packet_xmit(skb, cp, pp, &iph); /* do not touch skb anymore */ else { IP_VS_DBG_RL("warning: packet_xmit is null"); @@ -1342,25 +1704,76 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, * * Sync connection if it is about to close to * encorage the standby servers to update the connections timeout + * + * For ONE_PKT let ip_vs_sync_conn() do the filter work. 
*/ - atomic_inc(&cp->in_pkts); - if (af == AF_INET && - (ip_vs_sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); - cp->old_state = cp->state; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + pkts = sysctl_sync_threshold(ipvs); + else + pkts = atomic_add_return(1, &cp->in_pkts); + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, pkts); ip_vs_conn_put(cp); return ret; } +/* + * AF_INET handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET); +} + +/* + * AF_INET handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET); +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * AF_INET6 handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET6); +} + +/* + * AF_INET6 handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET6); +} + +#endif + /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP @@ -1372,109 +1785,248 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, * and send them to ip_vs_in_icmp. */ static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; + struct netns_ipvs *ipvs; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; - return ip_vs_in_icmp(skb, &r, hooknum); + /* ipvs enabled in this netns ? 
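The sync decision that used to sit here, visible in the removed lines above, moves into ip_vs_sync_conn(), which now receives the packet count so it can apply the threshold/period filter itself; for ONE_PACKET connections the threshold value is passed as the count so every packet stays eligible. A minimal sketch of the classic filter, assuming sync_threshold and sync_period keep the old sysctl_ip_vs_sync_threshold[0]/[1] meaning:

/* Sketch only: sync roughly once every sync_period packets, phase
 * shifted by sync_threshold. */
static bool sample_should_sync(int pkts, int sync_threshold, int sync_period)
{
	if (sync_period == 0)
		return pkts == sync_threshold;
	return (pkts % sync_period) == sync_threshold;
}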
*/ + net = skb_net(skb); + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + return ip_vs_in_icmp(skb, &r, ops->hooknum); } #ifdef CONFIG_IP_VS_IPV6 static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; + struct netns_ipvs *ipvs; + struct ip_vs_iphdr iphdr; - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; - return ip_vs_in_icmp_v6(skb, &r, hooknum); + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); } #endif static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 2, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_in, + .hook = ip_vs_remote_request4, .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 1, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply4, .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_FORWARD, + .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ + /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_post_routing, + .hook = ip_vs_reply4, .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_FORWARD, + .priority = 100, }, #ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 2, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
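The ordering of the IPv4 entries above is deliberate but easy to lose in diff form. A compact summary, derived only from the entries themselves (the IPv6 block that follows mirrors it with the NF_IP6_PRI_* values); within each hook point the lower priority value runs first, so reply handling always precedes request scheduling:

    /*
     * LOCAL_IN : ip_vs_reply4          (NAT_SRC - 2)  change source for VS/NAT replies
     *            ip_vs_remote_request4 (NAT_SRC - 1)  schedule/forward remote requests
     * LOCAL_OUT: ip_vs_local_reply4    (NAT_DST + 1)  replies from the local stack
     *            ip_vs_local_request4  (NAT_DST + 2)  requests from local clients
     * FORWARD  : ip_vs_forward_icmp    (99)           related ICMP for IPVS connections
     *            ip_vs_reply4          (100)          source rewrite for VS/NAT
     */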
*/ { - .hook = ip_vs_in, + .hook = ip_vs_remote_request6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 1, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_local_reply6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_out, + .hook = ip_vs_local_request6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp_v6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 99, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_FORWARD, + .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ + /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_post_routing, + .hook = ip_vs_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_FORWARD, + .priority = 100, }, #endif }; +/* + * Initialize IP Virtual Server netns mem. + */ +static int __net_init __ip_vs_init(struct net *net) +{ + struct netns_ipvs *ipvs; + + ipvs = net_generic(net, ip_vs_net_id); + if (ipvs == NULL) + return -ENOMEM; + + /* Hold the beast until a service is registerd */ + ipvs->enable = 0; + ipvs->net = net; + /* Counters used for creating unique names */ + ipvs->gen = atomic_read(&ipvs_netns_cnt); + atomic_inc(&ipvs_netns_cnt); + net->ipvs = ipvs; + + if (ip_vs_estimator_net_init(net) < 0) + goto estimator_fail; + + if (ip_vs_control_net_init(net) < 0) + goto control_fail; + + if (ip_vs_protocol_net_init(net) < 0) + goto protocol_fail; + + if (ip_vs_app_net_init(net) < 0) + goto app_fail; + + if (ip_vs_conn_net_init(net) < 0) + goto conn_fail; + + if (ip_vs_sync_net_init(net) < 0) + goto sync_fail; + + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", + sizeof(struct netns_ipvs), ipvs->gen); + return 0; +/* + * Error handling + */ + +sync_fail: + ip_vs_conn_net_cleanup(net); +conn_fail: + ip_vs_app_net_cleanup(net); +app_fail: + ip_vs_protocol_net_cleanup(net); +protocol_fail: + ip_vs_control_net_cleanup(net); +control_fail: + ip_vs_estimator_net_cleanup(net); +estimator_fail: + net->ipvs = NULL; + return -ENOMEM; +} + +static void __net_exit __ip_vs_cleanup(struct net *net) +{ + ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ + ip_vs_conn_net_cleanup(net); + ip_vs_app_net_cleanup(net); + ip_vs_protocol_net_cleanup(net); + ip_vs_control_net_cleanup(net); + ip_vs_estimator_net_cleanup(net); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + net->ipvs = NULL; +} + +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(net); + LeaveFunction(2); +} + +static struct pernet_operations ipvs_core_ops = { + .init = __ip_vs_init, + .exit = __ip_vs_cleanup, + .id = &ip_vs_net_id, + .size = sizeof(struct netns_ipvs), +}; +static struct 
pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; /* * Initialize IP Virtual Server @@ -1483,58 +2035,69 @@ static int __init ip_vs_init(void) { int ret; - ip_vs_estimator_init(); - ret = ip_vs_control_init(); if (ret < 0) { - IP_VS_ERR("can't setup control.\n"); - goto cleanup_estimator; + pr_err("can't setup control.\n"); + goto exit; } ip_vs_protocol_init(); - ret = ip_vs_app_init(); + ret = ip_vs_conn_init(); if (ret < 0) { - IP_VS_ERR("can't setup application helper.\n"); + pr_err("can't setup connection table.\n"); goto cleanup_protocol; } - ret = ip_vs_conn_init(); + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_conn; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { - IP_VS_ERR("can't setup connection table.\n"); - goto cleanup_app; + pr_err("can't register hooks.\n"); + goto cleanup_dev; } - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ret = ip_vs_register_nl_ioctl(); if (ret < 0) { - IP_VS_ERR("can't register hooks.\n"); - goto cleanup_conn; + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_hooks; } - IP_VS_INFO("ipvs loaded.\n"); + pr_info("ipvs loaded.\n"); + return ret; - cleanup_conn: +cleanup_hooks: + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); +cleanup_conn: ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: +cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); - cleanup_estimator: - ip_vs_estimator_cleanup(); +exit: return ret; } static void __exit ip_vs_cleanup(void) { + ip_vs_unregister_nl_ioctl(); nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); - ip_vs_estimator_cleanup(); - IP_VS_INFO("ipvs unloaded.\n"); + pr_info("ipvs unloaded.\n"); } module_init(ip_vs_init); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e01061f49cd..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -18,6 +18,9 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> @@ -28,12 +31,14 @@ #include <linux/workqueue.h> #include <linux/swap.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> #include <net/net_namespace.h> +#include <linux/nsproxy.h> #include <net/ip.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> @@ -50,41 +55,7 @@ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
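Before moving on to ip_vs_ctl.c, one point about the ip_vs_init()/ip_vs_cleanup() pair at the end of the ip_vs_core.c hunk above: the setup chain grew, and each error label has to unwind in exact reverse order. A condensed pairing of the calls, as they appear in the hunk:

    ip_vs_control_init();                        /* <-> ip_vs_control_cleanup()       */
    ip_vs_protocol_init();                       /* <-> ip_vs_protocol_cleanup()      */
    ip_vs_conn_init();                           /* <-> ip_vs_conn_cleanup()          */
    register_pernet_subsys(&ipvs_core_ops);      /* <-> unregister_pernet_subsys()    */
    register_pernet_device(&ipvs_core_dev_ops);  /* <-> unregister_pernet_device()    */
    nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); /* <-> nf_unregister_hooks() */
    ip_vs_register_nl_ioctl();                   /* <-> ip_vs_unregister_nl_ioctl()   */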
*/ static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - /* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; - #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; @@ -95,31 +66,35 @@ int ip_vs_get_debug_level(void) } #endif + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); + + #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) -{ - struct rt6_info *rt; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = *addr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, +static bool __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) +{ + struct flowi6 fl6 = { + .daddr = *addr, }; + struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); + bool is_local; - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) - return 1; + is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); - return 0; + dst_release(dst); + return is_local; } #endif + +#ifdef CONFIG_SYSCTL /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ -static void update_defense_level(void) +static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; static int old_secure_tcp = 0; @@ -135,73 +110,73 @@ static void update_defense_level(void) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < sysctl_ip_vs_amemthresh); + nomem = (availmem < ipvs->sysctl_amemthresh); local_bh_disable(); /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { + spin_lock(&ipvs->dropentry_lock); + switch (ipvs->sysctl_drop_entry) { case 0: - atomic_set(&ip_vs_dropentry, 0); + atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; + atomic_set(&ipvs->dropentry, 1); + ipvs->sysctl_drop_entry = 2; } else { - atomic_set(&ip_vs_dropentry, 0); + atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { - atomic_set(&ip_vs_dropentry, 1); + atomic_set(&ipvs->dropentry, 1); } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; + atomic_set(&ipvs->dropentry, 0); + ipvs->sysctl_drop_entry = 1; }; break; case 3: - atomic_set(&ip_vs_dropentry, 1); + atomic_set(&ipvs->dropentry, 1); break; } - 
spin_unlock(&__ip_vs_dropentry_lock); + spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { + spin_lock(&ipvs->droppacket_lock); + switch (ipvs->sysctl_drop_packet) { case 0: - ip_vs_drop_rate = 0; + ipvs->drop_rate = 0; break; case 1: if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + ipvs->sysctl_drop_packet = 2; } else { - ip_vs_drop_rate = 0; + ipvs->drop_rate = 0; } break; case 2: if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; + ipvs->drop_rate = 0; + ipvs->sysctl_drop_packet = 1; } break; case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + ipvs->drop_rate = ipvs->sysctl_am_droprate; break; } - spin_unlock(&__ip_vs_droppacket_lock); + spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { case 0: if (old_secure_tcp >= 2) to_change = 0; @@ -210,7 +185,7 @@ static void update_defense_level(void) if (nomem) { if (old_secure_tcp < 2) to_change = 1; - sysctl_ip_vs_secure_tcp = 2; + ipvs->sysctl_secure_tcp = 2; } else { if (old_secure_tcp >= 2) to_change = 0; @@ -223,7 +198,7 @@ static void update_defense_level(void) } else { if (old_secure_tcp >= 2) to_change = 0; - sysctl_ip_vs_secure_tcp = 1; + ipvs->sysctl_secure_tcp = 1; } break; case 3: @@ -231,10 +206,11 @@ static void update_defense_level(void) to_change = 1; break; } - old_secure_tcp = sysctl_ip_vs_secure_tcp; + old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); + ip_vs_protocol_timeout_change(ipvs, + ipvs->sysctl_secure_tcp > 1); + spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } @@ -244,17 +220,18 @@ static void update_defense_level(void) * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); static void defense_work_handler(struct work_struct *work) { - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, defense_work.work); - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + update_defense_level(ipvs); + if (atomic_read(&ipvs->dropentry)) + ip_vs_random_dropentry(ipvs->net); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } +#endif int ip_vs_use_count_inc(void) @@ -277,87 +254,70 @@ ip_vs_use_count_dec(void) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << 
IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, - __be16 port) +static inline unsigned int +ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; + __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif + ahash = ntohl(addr_fold); + ahash ^= ((size_t) net >> 8); - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) { - return fwmark & IP_VS_SVC_TAB_MASK; + return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* - * Hashes a service in the ip_vs_svc_table by <proto,addr,port> + * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> * or in the ip_vs_svc_fwm_table by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned hash; + unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { - IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* - * Hash it by <protocol,addr,port> in ip_vs_svc_table + * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, - svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); + hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + &svc->addr, svc->port); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* - * Hash it by fwmark in ip_vs_svc_fwm_table + * Hash it by fwmark in svc_fwm_table */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -368,23 +328,23 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Unhashes a service from svc_table / svc_fwm_table. * Should be called with locked tables. 
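Both hash key helpers above now fold the owning namespace into the bucket index, so services from different netns instances spread across the same shared tables; the lookup side then filters with net_eq(). Stripped of the IPv6 address folding, the computation is simply:

    ahash  = ntohl(addr_fold) ^ ((size_t) net >> 8);  /* shift away the low pointer bits,
                                                          presumably mostly alignment zeros */
    bucket = (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth)
             & IP_VS_SVC_TAB_MASK;

    /* and for fwmark-based services: */
    bucket = (((size_t) net >> 8) ^ fwmark) & IP_VS_SVC_TAB_MASK;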
*/ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " - "called from %p\n", __builtin_return_address(0)); + pr_err("%s(): request for unhash flagged, called from %pF\n", + __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ - list_del(&svc->s_list); + /* Remove it from the svc_table table */ + hlist_del_rcu(&svc->s_list); } else { - /* Remove it from the ip_vs_svc_fwm_table table */ - list_del(&svc->f_list); + /* Remove it from the svc_fwm_table table */ + hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; @@ -394,25 +354,25 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* - * Get service by {proto,addr,port} in the service table. + * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * -__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, - __be16 vport) +__ip_vs_service_find(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) - && (svc->protocol == protocol)) { + && (svc->protocol == protocol) + && net_eq(svc->net, net)) { /* HIT */ - atomic_inc(&svc->usecnt); return svc; } } @@ -425,18 +385,18 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, * Get service by {fwmark} in the service table. 
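Note the change of contract here: the old __ip_vs_service_get() took the service rwlock and bumped svc->usecnt, while this helper, the fwmark variant and the exported ip_vs_service_find() that follow walk an hlist under RCU and hand back a bare pointer. A caller is expected to bracket the lookup itself, roughly:

    rcu_read_lock();
    svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
    if (svc) {
            /* use svc; if it must outlive this section, take a reference,
             * as other paths in this patch do via svc->refcnt */
    }
    rcu_read_unlock();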
*/ static inline struct ip_vs_service * -__ip_vs_svc_fwm_get(int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); + hash = ip_vs_svc_fwm_hashkey(net, fwmark); - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af + && net_eq(svc->net, net)) { /* HIT */ - atomic_inc(&svc->usecnt); return svc; } } @@ -444,48 +404,49 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark) return NULL; } +/* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; - - read_lock(&__ip_vs_svc_lock); + struct netns_ipvs *ipvs = net_ipvs(net); /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) - goto out; + if (fwmark) { + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (svc) + goto out; + } /* * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ - svc = __ip_vs_service_get(af, protocol, vaddr, vport); + svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) + && atomic_read(&ipvs->ftpsvc_counter) && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. 
*/ - svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); } if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { + && atomic_read(&ipvs->nullsvc_counter)) { /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_get(af, protocol, vaddr, 0); + svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); } out: - read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), @@ -499,28 +460,47 @@ static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); - dest->svc = svc; + rcu_assign_pointer(dest->svc, svc); } -static inline void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) +static void ip_vs_service_free(struct ip_vs_service *svc) { - struct ip_vs_service *svc = dest->svc; + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); +} - dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} + +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{ + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + if (do_delay) + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); + else + ip_vs_service_free(svc); + } } /* * Returns hash value for real service */ -static inline unsigned ip_vs_rs_hashkey(int af, +static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -533,17 +513,13 @@ static inline unsigned ip_vs_rs_hashkey(int af, & IP_VS_RTAB_MASK; } -/* - * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { - unsigned hash; + unsigned int hash; - if (!list_empty(&dest->d_list)) { - return 0; - } + if (dest->in_rs_table) + return; /* * Hash by proto,addr,port, @@ -551,64 +527,51 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; } -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* - * Remove it from the ip_vs_rtable table. + * Remove it from the rs_table table. */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; } - - return 1; } -/* - * Lookup real service by <proto,addr,port> in the real service table. 
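With lookups no longer pinning objects under a lock, freeing a service has to wait out any RCU readers still traversing the hash chains. __ip_vs_svc_put() above encodes that: the final reference drop either queues the object with call_rcu() or frees it on the spot, depending on whether readers can still reach it. The two call forms seen later in this patch:

    __ip_vs_svc_put(svc, true);   /* just unlinked from a live RCU table: defer kfree */
    __ip_vs_svc_put(svc, false);  /* as in ip_vs_dest_free(), where immediate free is used */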
- */ -struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) { - unsigned hash; + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned int hash; struct ip_vs_dest *dest; - /* - * Check for "full" addressed entries - * Return the first found entry - */ + /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { + rcu_read_lock(); + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; + rcu_read_unlock(); + return true; } } - read_unlock(&__ip_vs_rs_lock); + rcu_read_unlock(); - return NULL; + return false; } -/* - * Lookup destination by {addr,port} in the given service +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, @@ -619,7 +582,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find the destination for the given service */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == svc->af) && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && (dest->port == dport)) { @@ -633,32 +596,56 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in + * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. + * Called under RCU lock, no refcnt is returned. 
*/ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, + const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol) + __be16 vport, __u16 protocol, __u32 fwmark, + __u32 flags) { struct ip_vs_dest *dest; struct ip_vs_service *svc; + __be16 port = dport; - svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); + svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); + if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) + port = 0; + dest = ip_vs_lookup_dest(svc, daddr, port); + if (!dest) + dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); return dest; } +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed @@ -673,12 +660,14 @@ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { - struct ip_vs_dest *dest, *nxt; + struct ip_vs_dest *dest; + struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, @@ -694,28 +683,29 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); + list_del(&dest->t_list); + ip_vs_dest_hold(dest); + goto out; } } - return NULL; + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; } +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + + __ip_vs_dst_cache_reset(dest); + __ip_vs_svc_put(svc, false); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} /* * Clean up all the destinations in the trash @@ -724,27 +714,54 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. + * be 0, so we simply release them here. 
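A related pattern appears for the per-destination route cache just above: dest->dest_dst is an RCU-managed ip_vs_dest_dst, so invalidating it is a swap-and-defer rather than an in-place reset, and packets still holding the old entry keep a valid route until the grace period ends. Call sites reset it under the destination's dst lock, as __ip_vs_update_dest() does further down:

    spin_lock_bh(&dest->dst_lock);
    __ip_vs_dst_cache_reset(dest);   /* RCU_INIT_POINTER(...) to NULL, then call_rcu() */
    spin_unlock_bh(&dest->dst_lock);

    /* the RCU callback finally drops the route: dst_release() + kfree() */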
*/ -static void ip_vs_trash_cleanup(void) +static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); } } +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c + + spin_lock_bh(&src->lock); + + IP_VS_SHOW_STATS_COUNTER(conns); + IP_VS_SHOW_STATS_COUNTER(inpkts); + IP_VS_SHOW_STATS_COUNTER(outpkts); + IP_VS_SHOW_STATS_COUNTER(inbytes); + IP_VS_SHOW_STATS_COUNTER(outbytes); + + ip_vs_read_estimator(dst, src); + + spin_unlock_bh(&src->lock); +} static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock_bh(&stats->lock); - memset(&stats->ustats, 0, sizeof(stats->ustats)); + /* get current counters as zero point, rates are zeroed */ + +#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c + + IP_VS_ZERO_STATS_COUNTER(conns); + IP_VS_ZERO_STATS_COUNTER(inpkts); + IP_VS_ZERO_STATS_COUNTER(outpkts); + IP_VS_ZERO_STATS_COUNTER(inbytes); + IP_VS_ZERO_STATS_COUNTER(outbytes); + ip_vs_zero_estimator(stats); spin_unlock_bh(&stats->lock); @@ -754,51 +771,40 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) * Update a destination in the given service */ static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) +__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, + struct ip_vs_dest_user_kern *udest, int add) { + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_service *old_svc; + struct ip_vs_scheduler *sched; int conn_flags; /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; - - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } + conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; + conn_flags |= IP_VS_CONN_F_INACTIVE; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* - * Put the real service in ip_vs_rtable if not present. + * Put the real service in rs_table if not present. * For now only for NAT! 
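The statistics handling above also changes shape: ip_vs_zero_stats() no longer memsets the counters, it records the current values in ustats0 as a zero point (only the rate estimator is actually reset), and readers such as ip_vs_copy_stats() report the difference. Per counter that amounts to:

    spin_lock_bh(&src->lock);
    dst->conns   = src->ustats.conns   - src->ustats0.conns;
    dst->inpkts  = src->ustats.inpkts  - src->ustats0.inpkts;
    dst->inbytes = src->ustats.inbytes - src->ustats0.inbytes;
    /* likewise for outpkts and outbytes */
    ip_vs_read_estimator(dst, src);
    spin_unlock_bh(&src->lock);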
*/ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(ipvs, dest); } atomic_set(&dest->conn_flags, conn_flags); /* bind the service */ - if (!dest->svc) { + old_svc = rcu_dereference_protected(dest->svc, 1); + if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); + if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); + __ip_vs_svc_put(old_svc, true); } } @@ -809,6 +815,22 @@ __ip_vs_update_dest(struct ip_vs_service *svc, dest->flags &= ~IP_VS_DEST_F_OVERLOAD; dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + + spin_lock_bh(&dest->dst_lock); + __ip_vs_dst_cache_reset(dest); + spin_unlock_bh(&dest->dst_lock); + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (add) { + ip_vs_start_estimator(svc->net, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); + svc->num_dests++; + if (sched->add_dest) + sched->add_dest(svc, dest); + } else { + if (sched->upd_dest) + sched->upd_dest(svc, dest); + } } @@ -820,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { struct ip_vs_dest *dest; - unsigned atype; + unsigned int atype, i; EnterFunction(2); @@ -829,20 +851,28 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) return -EINVAL; } else #endif { - atype = inet_addr_type(&init_net, udest->addr.ip); + atype = inet_addr_type(svc->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); - if (dest == NULL) { - IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); + if (dest == NULL) return -ENOMEM; + + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) + goto err_alloc; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_dest_stats; + ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); + u64_stats_init(&ip_vs_dest_stats->syncp); } dest->af = svc->af; @@ -856,18 +886,21 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); + atomic_set(&dest->refcnt, 1); - INIT_LIST_HEAD(&dest->d_list); + INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); + __ip_vs_update_dest(svc, dest, udest, 1); *dest_p = dest; LeaveFunction(2); return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; } @@ -885,25 +918,25 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); if (udest->weight < 0) { - IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " - "upper threshold\n"); + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); return -ERANGE; } ip_vs_addr_copy(svc->af, &daddr, 
&udest->addr); - /* - * Check if the dest already exists in the list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest != NULL) { - IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + IP_VS_DBG(1, "%s(): dest already exists\n", __func__); return -EEXIST; } @@ -922,65 +955,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - + __ip_vs_update_dest(svc, dest, udest, 1); + ret = 0; + } else { /* - * Wait until all other svc users go away. + * Allocate and initialize the dest structure */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; + ret = ip_vs_new_dest(svc, udest, &dest); } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - LeaveFunction(2); - return 0; + return ret; } @@ -997,84 +982,61 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); if (udest->weight < 0) { - IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " - "upper threshold\n"); + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); return -ERANGE; } ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Lookup the destination list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); return -ENOENT; } - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - + __ip_vs_update_dest(svc, dest, udest, 0); LeaveFunction(2); return 0; } - /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, + bool cleanup) { - ip_vs_kill_estimator(&dest->stats); + struct netns_ipvs *ipvs = net_ipvs(net); + + 
ip_vs_stop_estimator(net, &dest->stats); /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&__ip_vs_rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); - atomic_inc(&dest->refcnt); - } + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash without reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = 0; + spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_dest_put(dest); } @@ -1090,14 +1052,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, /* * Remove it from the d-linked destination list. */ - list_del(&dest->n_list); + list_del_rcu(&dest->n_list); svc->num_dests--; - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched->del_dest) + sched->del_dest(svc, dest); + } } @@ -1112,48 +1076,75 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + rcu_read_unlock(); if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. 
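Destination teardown now works without the old refcnt-inflated global trash: a deleted dest is unlinked from its service under RCU, parked on the per-netns dest_trash without holding a reference of its own, and ip_vs_dest_trash_expire() (a little further down) frees it once its refcnt has reached zero and it has idled for IP_VS_DEST_TRASH_PERIOD. Combining __ip_vs_unlink_dest() and __ip_vs_del_dest() above, the deletion side is roughly:

    list_del_rcu(&dest->n_list);                 /* drop from svc->destinations    */
    spin_lock_bh(&ipvs->dest_trash_lock);
    if (list_empty(&ipvs->dest_trash) && !cleanup)
            mod_timer(&ipvs->dest_trash_timer,
                      jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
    list_add(&dest->t_list, &ipvs->dest_trash);  /* lives here unreferenced        */
    spin_unlock_bh(&ipvs->dest_trash_lock);
    ip_vs_dest_put(dest);                        /* release the caller's reference */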
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); - write_unlock_bh(&__ip_vs_svc_lock); - /* * Delete the destination */ - __ip_vs_del_dest(dest); + __ip_vs_del_dest(svc->net, dest, false); LeaveFunction(2); return 0; } +static void ip_vs_dest_trash_expire(unsigned long data) +{ + struct net *net = (struct net *) data; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest *dest, *next; + unsigned long now = jiffies; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + if (atomic_read(&dest->refcnt) > 0) + continue; + if (dest->idle_start) { + if (time_before(now, dest->idle_start + + IP_VS_DEST_TRASH_PERIOD)) + continue; + } else { + dest->idle_start = max(1UL, now); + continue; + } + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + spin_unlock(&ipvs->dest_trash_lock); +} /* * Add a service into the service hash table */ static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; + int ret = 0, i; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); @@ -1161,28 +1152,52 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, /* Lookup the scheduler by 'u->sched_name' */ sched = ip_vs_scheduler_get(u->sched_name); if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); ret = -ENOENT; - goto out_mod_dec; + goto out_err; + } + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out_err; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } } #endif - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { - IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; goto out_err; } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) { + ret = -ENOMEM; + goto out_err; + } + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_stats; + ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); + u64_stats_init(&ip_vs_stats->syncp); + } + /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); atomic_set(&svc->refcnt, 0); svc->af = u->af; @@ -1193,9 +1208,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, svc->flags = u->flags; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; + svc->net = net; INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ 
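Both services and destinations now carry per-cpu counters, and ip_vs_add_service() above uses the same allocation idiom as ip_vs_new_dest() earlier: allocate the per-cpu area, then initialise the u64 stats seqcount on every possible CPU so the 64-bit counters can be read consistently on 32-bit machines. In isolation, the idiom is:

    svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
    if (!svc->stats.cpustats)
            return -ENOMEM;                     /* the real code unwinds via out_err */

    for_each_possible_cpu(i) {
            struct ip_vs_cpu_stats *s = per_cpu_ptr(svc->stats.cpustats, i);

            u64_stats_init(&s->syncp);
    }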
@@ -1204,40 +1220,39 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_err; sched = NULL; + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + /* Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); + atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); + atomic_inc(&ipvs->nullsvc_counter); - ip_vs_new_estimator(&svc->stats); + ip_vs_start_estimator(net, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) - ip_vs_num_services++; + ipvs->num_services++; /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; return 0; - out_err: + + out_err: if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - kfree(svc); + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); - out_mod_dec: /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1252,6 +1267,7 @@ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* @@ -1259,25 +1275,44 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) */ sched = ip_vs_scheduler_get(u->sched_name); if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); return -ENOENT; } old_sched = sched; + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } } #endif - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + /* Bind the new scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + old_sched = sched; + goto out; + } + /* Unbind the old scheduler on success */ + ip_vs_unbind_scheduler(svc, old_sched); + } /* * Set the flags and timeout value @@ -1286,126 +1321,93 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. 
TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 - out: -#endif - - if (old_sched) - ip_vs_scheduler_put(old_sched); + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) + rcu_assign_pointer(svc->pe, pe); +out: + ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); return ret; } - /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! * - We are called under _bh lock */ -static void __ip_vs_del_service(struct ip_vs_service *svc) +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = net_ipvs(svc->net); + + pr_info("%s: enter\n", __func__); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) - ip_vs_num_services--; + ipvs->num_services--; - ip_vs_kill_estimator(&svc->stats); + ip_vs_stop_estimator(svc->net, &svc->stats); /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); + ip_vs_scheduler_put(old_sched); - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); + ip_vs_pe_put(old_pe); /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); + __ip_vs_del_dest(svc->net, dest, cleanup); } /* * Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); + atomic_dec(&ipvs->ftpsvc_counter); else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); + atomic_dec(&ipvs->nullsvc_counter); /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); + __ip_vs_svc_put(svc, true); /* decrease the module use count */ ip_vs_use_count_dec(); } /* - * Delete a service from the service list + * Unlink a service from list and try to delete it if its refcnt reached 0 */ -static int ip_vs_del_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { - if (svc == NULL) - return -EEXIST; - + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. 
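The removed IP_VS_WAIT_WHILE(usecnt) loop is not replaced by any new waiting. ip_vs_unlink_service() above simply takes one extra reference (to guard against a second release through dest_trash), unhashes the service so new RCU lookups miss it, and lets __ip_vs_del_service() finish with __ip_vs_svc_put(svc, true), which defers the actual kfree past the grace period:

    atomic_inc(&svc->refcnt);          /* guard against double release via dest_trash */
    ip_vs_svc_unhash(svc);             /* hlist_del_rcu(): invisible to new lookups   */
    __ip_vs_del_service(svc, cleanup); /* ends in __ip_vs_svc_put(svc, true)          */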
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - __ip_vs_del_service(svc); + __ip_vs_del_service(svc, cleanup); +} - write_unlock_bh(&__ip_vs_svc_lock); +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + ip_vs_unlink_service(svc, false); return 0; } @@ -1414,24 +1416,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(void) +static int ip_vs_flush(struct net *net, bool cleanup) { int idx; - struct ip_vs_service *svc, *nxt; + struct ip_vs_service *svc; + struct hlist_node *n; /* - * Flush the service table hashed by <protocol,addr,port> + * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc, cleanup); } } @@ -1439,22 +1437,97 @@ static int ip_vs_flush(void) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc, cleanup); } } return 0; } +/* + * Delete service by {netns} in the service table. 
+ * Called by __ip_vs_cleanup() + */ +void ip_vs_service_net_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net, true); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} + +/* Put all references for device (dst_cache) */ +static inline void +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) +{ + struct ip_vs_dest_dst *dest_dst; + + spin_lock_bh(&dest->dst_lock); + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + __ip_vs_dst_cache_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_DOWN || !ipvs) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); + } + } + } + + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); + } + } + + } + } + + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); + } + spin_unlock_bh(&ipvs->dest_trash_lock); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} /* * Zero counters in a service or all services @@ -1463,60 +1536,64 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; - write_lock_bh(&__ip_vs_svc_lock); list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); return 0; } -static int ip_vs_zero_all(void) +static int ip_vs_zero_all(struct net *net) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); } } - ip_vs_zero_stats(&ip_vs_stats); + ip_vs_zero_stats(&net_ipvs(net)->tot_stats); return 0; } +#ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3; static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, +proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = current->nsproxy->net_ns; int *valp = table->data; int val = *valp; 
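ip_vs_dst_event() above hooks NETDEV_DOWN through a netdevice notifier so that cached destination routes pointing at a disappearing device can be dropped. A stripped-down sketch of the same notifier wiring follows; it is not part of the patch and the my_* names are hypothetical.

    #include <linux/module.h>
    #include <linux/netdevice.h>

    static int my_dst_event(struct notifier_block *this, unsigned long event,
                            void *ptr)
    {
            struct net_device *dev = netdev_notifier_info_to_dev(ptr);

            if (event != NETDEV_DOWN)
                    return NOTIFY_DONE;

            pr_debug("%s: %s is going down, dropping cached routes\n",
                     __func__, dev->name);
            /* ...walk private caches and release references to dev here... */
            return NOTIFY_DONE;
    }

    static struct notifier_block my_dst_notifier = {
            .notifier_call = my_dst_event,
    };

    static int __init my_init(void)
    {
            return register_netdevice_notifier(&my_dst_notifier);
    }

    static void __exit my_exit(void)
    {
            unregister_netdevice_notifier(&my_dst_notifier);
    }

    module_init(my_init);
    module_exit(my_exit);
    MODULE_LICENSE("GPL");

The patch registers ip_vs_dst_notifier the same way and, in the handler, walks both service hash tables plus the per-netns dest_trash list under __ip_vs_mutex.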
int rc; - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + rc = proc_dointvec(table, write, buffer, lenp, ppos); if (write && (*valp != val)) { if ((*valp < 0) || (*valp > 3)) { /* Restore the correct value */ *valp = val; } else { - update_defense_level(); + update_defense_level(net_ipvs(net)); } } return rc; } - static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, +proc_do_sync_threshold(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1526,64 +1603,210 @@ proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, /* backup the value first */ memcpy(val, valp, sizeof(val)); - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || + (valp[0] >= valp[1] && valp[1]))) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); } return rc; } +static int +proc_do_sync_mode(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 1)) { + /* Restore the correct value */ + *valp = val; + } + } + return rc; +} + +static int +proc_do_sync_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (*valp < 1 || !is_power_of_2(*valp)) { + /* Restore the correct value */ + *valp = val; + } + } + return rc; +} /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * align with netns init in ip_vs_control_net_init() */ static struct ctl_table vs_vars[] = { { .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, + { + .procname = "snat_reroute", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_mode, + }, + { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_ports, + }, + { + .procname = "sync_persist_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = 
"sync_qlen_max", + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_sctp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = proc_do_sync_threshold, + }, + { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + { + .procname = "nat_icmp_send", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "pmtu_disc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #if 0 { .procname = "timeout_established", @@ -1670,58 +1893,16 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec_jiffies, }, #endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .ctl_name = 0 } -}; - -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "vs", }, { } }; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); -static struct ctl_table_header * sysctl_header; +#endif #ifdef CONFIG_PROC_FS struct ip_vs_iter { - struct list_head *table; + struct seq_net_private p; /* Do not move this, netns depends upon it*/ + struct hlist_head *table; int bucket; }; @@ -1729,7 +1910,7 @@ struct ip_vs_iter { * Write the contents of the VS rule table to a PROCfs file. 
* (It is kept just for backward compatibility) */ -static inline const char *ip_vs_fwd_name(unsigned flags) +static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: @@ -1747,14 +1928,15 @@ static inline const char *ip_vs_fwd_name(unsigned flags) /* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { + struct net *net = seq_file_net(seq); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; @@ -1764,8 +1946,9 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { + if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; @@ -1777,17 +1960,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) + __acquires(RCU) { - - read_lock_bh(&__ip_vs_svc_lock); + rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *e; + struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; @@ -1800,13 +1982,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { return svc; } } @@ -1817,13 +2000,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) return svc; } @@ -1831,9 +2016,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) + __releases(RCU) { - read_unlock_bh(&__ip_vs_svc_lock); + rcu_read_unlock(); } @@ -1842,7 +2027,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d 
(size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -1851,6 +2036,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1859,17 +2045,19 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - svc->scheduler->name); + sched->name); else #endif - seq_printf(seq, "%s %08X:%04X %s ", + seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - svc->scheduler->name); + sched->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); + seq_printf(seq, "FWM %08X %s %s", + svc->fwmark, sched->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } if (svc->flags & IP_VS_SVC_F_PERSISTENT) @@ -1879,7 +2067,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) else seq_putc(seq, '\n'); - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, @@ -1917,7 +2105,7 @@ static const struct seq_operations ip_vs_info_seq_ops = { static int ip_vs_info_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ip_vs_info_seq_ops, + return seq_open_net(inode, file, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)); } @@ -1926,18 +2114,13 @@ static const struct file_operations ip_vs_info_fops = { .open = ip_vs_info_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), + .release = seq_release_net, }; -#ifdef CONFIG_PROC_FS static int ip_vs_stats_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats_user show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -1945,29 +2128,25 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_printf(seq, " Conns Packets Packets Bytes Bytes\n"); - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, - ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, - (unsigned long long) ip_vs_stats.ustats.inbytes, - (unsigned long long) ip_vs_stats.ustats.outbytes); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, + show.inpkts, show.outpkts, + (unsigned long long) show.inbytes, + (unsigned long long) show.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.ustats.cps, - ip_vs_stats.ustats.inpps, - ip_vs_stats.ustats.outpps, - ip_vs_stats.ustats.inbps, - ip_vs_stats.ustats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); + seq_printf(seq, "%8X %8X %8X %16X %16X\n", + show.cps, show.inpps, show.outpps, + show.inbps, show.outbps); return 0; } static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) { - return single_open(file, 
ip_vs_stats_show, NULL); + return single_open_net(inode, file, ip_vs_stats_show); } static const struct file_operations ip_vs_stats_fops = { @@ -1975,16 +2154,88 @@ static const struct file_operations ip_vs_stats_fops = { .open = ip_vs_stats_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; + struct ip_vs_stats_user rates; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); + unsigned int start; + __u64 inbytes, outbytes; + + do { + start = u64_stats_fetch_begin_irq(&u->syncp); + inbytes = u->ustats.inbytes; + outbytes = u->ustats.outbytes; + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); + + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, (__u64)inbytes, + (__u64)outbytes); + } + + spin_lock_bh(&tot_stats->lock); + + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", + tot_stats->ustats.conns, tot_stats->ustats.inpkts, + tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); + + ip_vs_read_estimator(&rates, tot_stats); + + spin_unlock_bh(&tot_stats->lock); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8X %8X %8X %16X %16X\n", + rates.cps, + rates.inpps, + rates.outpps, + rates.inbps, + rates.outbps); + + return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; #endif /* * Set timeout values for tcp tcpfin udp in the timeout_table. 
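ip_vs_stats_percpu_show() above reads the new per-cpu counters through the u64_stats_sync helpers so that 64-bit byte counts stay consistent even on 32-bit SMP. A minimal writer/reader pair using the same primitives, not part of the patch, with hypothetical my_* names:

    #include <linux/types.h>
    #include <linux/percpu.h>
    #include <linux/cpumask.h>
    #include <linux/u64_stats_sync.h>

    struct my_cpu_stats {
            u64 inbytes;
            u64 outbytes;
            struct u64_stats_sync syncp;
    };

    /* Writer: called with preemption disabled (e.g. from softirq) */
    static void my_add_bytes(struct my_cpu_stats __percpu *stats, u64 in, u64 out)
    {
            struct my_cpu_stats *s = this_cpu_ptr(stats);

            u64_stats_update_begin(&s->syncp);
            s->inbytes += in;
            s->outbytes += out;
            u64_stats_update_end(&s->syncp);
    }

    /* Reader: retry the snapshot if a writer raced with us */
    static void my_sum_bytes(struct my_cpu_stats __percpu *stats,
                             u64 *inbytes, u64 *outbytes)
    {
            int cpu;

            *inbytes = 0;
            *outbytes = 0;
            for_each_possible_cpu(cpu) {
                    struct my_cpu_stats *s = per_cpu_ptr(stats, cpu);
                    unsigned int start;
                    u64 in, out;

                    do {
                            start = u64_stats_fetch_begin_irq(&s->syncp);
                            in = s->inbytes;
                            out = s->outbytes;
                    } while (u64_stats_fetch_retry_irq(&s->syncp, start));

                    *inbytes += in;
                    *outbytes += out;
            }
    }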
*/ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) { +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, @@ -1992,19 +2243,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif @@ -2037,6 +2291,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { + memset(usvc, 0, sizeof(*usvc)); + usvc->af = AF_INET; usvc->protocol = usvc_compat->protocol; usvc->addr.ip = usvc_compat->addr; @@ -2054,6 +2310,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, struct ip_vs_dest_user *udest_compat) { + memset(udest, 0, sizeof(*udest)); + udest->addr.ip = udest_compat->addr; udest->port = udest_compat->port; udest->conn_flags = udest_compat->conn_flags; @@ -2065,6 +2323,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, static int do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { + struct net *net = sock_net(sk); int ret; unsigned char arg[MAX_ARG_LEN]; struct ip_vs_service_user *usvc_compat; @@ -2072,13 +2331,18 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_service *svc; struct ip_vs_dest_user *udest_compat; struct ip_vs_dest_user_kern udest; + struct netns_ipvs *ipvs = net_ipvs(net); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) + return -EINVAL; + if (len < 0 || len > MAX_ARG_LEN) + return -EINVAL; if (len != set_arglen[SET_CMDID(cmd)]) { - IP_VS_ERR("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); + pr_err("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); return -EINVAL; } @@ -2088,6 +2352,24 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* increase the module use count */ ip_vs_use_count_inc(); + /* Handle daemons since they have another lock */ + if (cmd == IP_VS_SO_SET_STARTDAEMON || + cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + + if (mutex_lock_interruptible(&ipvs->sync_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + if (cmd == IP_VS_SO_SET_STARTDAEMON) + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); + else + ret = stop_sync_thread(net, dm->state); + mutex_unlock(&ipvs->sync_mutex); + goto out_dec; + } + if (mutex_lock_interruptible(&__ip_vs_mutex)) { ret = -ERESTARTSYS; goto out_dec; @@ -2095,19 
+2377,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(); + ret = ip_vs_flush(net, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); + ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); goto out_unlock; } @@ -2122,26 +2396,29 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); + ret = ip_vs_zero_all(net); goto out_unlock; } } - /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { - IP_VS_ERR("set_ctl: invalid protocol: %d %pI4:%d %s\n", - usvc.protocol, &usvc.addr.ip, - ntohs(usvc.port), usvc.sched_name); + /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && + usvc.protocol != IPPROTO_SCTP) { + pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n", + usvc.protocol, &usvc.addr.ip, + ntohs(usvc.port), usvc.sched_name); ret = -EFAULT; goto out_unlock; } /* Lookup the exact service by <protocol, addr, port> or fwmark */ + rcu_read_lock(); if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); + svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2154,7 +2431,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(&usvc, &svc); + ret = ip_vs_add_service(net, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); @@ -2180,9 +2457,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; } - if (svc) - ip_vs_service_put(svc); - out_unlock: mutex_unlock(&__ip_vs_mutex); out_dec: @@ -2194,21 +2468,16 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, &src->ustats, sizeof(*dst)); - spin_unlock_bh(&src->lock); -} - -static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = 
src->netmask; @@ -2217,7 +2486,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) } static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, +__ip_vs_get_service_entries(struct net *net, + const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; @@ -2226,9 +2496,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) + if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; if (count >= get->num_services) @@ -2245,9 +2515,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) + if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; if (count >= get->num_services) @@ -2262,29 +2532,32 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, count++; } } - out: +out: return ret; } static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; + rcu_read_lock(); if (get->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else - svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, - get->port); + svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + get->port); + rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; + memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; @@ -2306,24 +2579,29 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, } count++; } - ip_vs_service_put(svc); } else ret = -ESRCH; return ret; } static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) { +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + memset(u, 0, sizeof (*u)); + #ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } @@ -2351,18 +2629,56 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { unsigned char arg[128]; int ret = 0; + unsigned int copylen; + struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); - if (!capable(CAP_NET_ADMIN)) + BUG_ON(!net); + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 
return -EPERM; + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) + return -EINVAL; + if (*len < get_arglen[GET_CMDID(cmd)]) { - IP_VS_ERR("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); + pr_err("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); return -EINVAL; } - if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + copylen = get_arglen[GET_CMDID(cmd)]; + if (copylen > 128) + return -EINVAL; + + if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; + /* + * Handle daemons first since it has its own locking + */ + if (cmd == IP_VS_SO_GET_DAEMON) { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (mutex_lock_interruptible(&ipvs->sync_mutex)) + return -ERESTARTSYS; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; + } + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + mutex_unlock(&ipvs->sync_mutex); + return ret; + } if (mutex_lock_interruptible(&__ip_vs_mutex)) return -ERESTARTSYS; @@ -2373,7 +2689,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -2386,8 +2702,8 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; - info.num_services = ip_vs_num_services; + info.size = ip_vs_conn_tab_size; + info.num_services = ipvs->num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } @@ -2402,11 +2718,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) size = sizeof(*get) + sizeof(struct ip_vs_service_entry) * get->num_services; if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); + pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } - ret = __ip_vs_get_service_entries(get, user); + ret = __ip_vs_get_service_entries(net, get, user); } break; @@ -2418,16 +2734,18 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; + rcu_read_lock(); if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else - svc = __ip_vs_service_get(AF_INET, entry->protocol, - &addr, entry->port); + svc = __ip_vs_service_find(net, AF_INET, + entry->protocol, &addr, + entry->port); + rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) ret = -EFAULT; - ip_vs_service_put(svc); } else ret = -ESRCH; } @@ -2442,11 +2760,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) size = sizeof(*get) + sizeof(struct ip_vs_dest_entry) * get->num_dests; if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); + pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } - ret = __ip_vs_get_dest_entries(get, user); + ret = __ip_vs_get_dest_entries(net, get, user); } break; @@ 
-2454,37 +2772,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } break; - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; - } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - default: ret = -EINVAL; } - out: +out: mutex_unlock(&__ip_vs_mutex); return ret; } @@ -2512,6 +2810,7 @@ static struct genl_family ip_vs_genl_family = { .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ }; /* Policy used for first-level command attributes */ @@ -2542,6 +2841,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, .len = sizeof(struct ip_vs_flags) }, [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, @@ -2567,31 +2868,29 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_stats *stats) { + struct ip_vs_stats_user ustats; struct nlattr *nl_stats = nla_nest_start(skb, container_type); if (!nl_stats) return -EMSGSIZE; - spin_lock_bh(&stats->lock); - - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); - - spin_unlock_bh(&stats->lock); - + ip_vs_copy_stats(&ustats, stats); + + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: - spin_unlock_bh(&stats->lock); nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } @@ -2599,6 +2898,8 @@ nla_put_failure: static int 
ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; @@ -2607,21 +2908,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, if (!nl_service) return -EMSGSIZE; - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - + if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) + goto nla_put_failure; if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) + goto nla_put_failure; } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || + nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + goto nla_put_failure; } - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - + sched = rcu_dereference_protected(svc->scheduler, 1); + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || + nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || + nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) goto nla_put_failure; @@ -2640,7 +2946,7 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb, { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); if (!hdr) @@ -2662,11 +2968,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; + struct net *net = skb_sknet(skb); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start) + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2676,8 +2983,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start) + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2693,11 +3000,14 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, - struct nlattr *nla, int full_entry) +static int ip_vs_genl_parse_service(struct net *net, + struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry, + struct ip_vs_service **ret_svc) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + struct ip_vs_service *svc; /* Parse mandatory identifying service fields first */ if (nla == NULL || 
@@ -2713,6 +3023,8 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; + memset(usvc, 0, sizeof(*usvc)); + usvc->af = nla_get_u16(nla_af); #ifdef CONFIG_IP_VS_IPV6 if (usvc->af != AF_INET && usvc->af != AF_INET6) @@ -2727,18 +3039,27 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); + usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } + rcu_read_lock(); + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + rcu_read_unlock(); + *ret_svc = svc; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { - struct nlattr *nla_sched, *nla_flags, *nla_timeout, + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; - struct ip_vs_service *svc; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; @@ -2749,42 +3070,30 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, nla_memcpy(&flags, nla_flags, sizeof(flags)); /* prefill flags from service if it already exists */ - if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); - else - svc = __ip_vs_service_get(usvc->af, usvc->protocol, - &usvc->addr, usvc->port); - if (svc) { + if (svc) usvc->flags = svc->flags; - ip_vs_service_put(svc); - } else - usvc->flags = 0; /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); + usvc->netmask = nla_get_be32(nla_netmask); } return 0; } -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, + struct nlattr *nla) { struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(&usvc, nla, 0); - if (ret) - return ERR_PTR(ret); - - if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - else - return __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); + ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + return ret ? 
ERR_PTR(ret) : svc; } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) @@ -2795,21 +3104,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) if (!nl_dest) return -EMSGSIZE; - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - + if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, + (atomic_read(&dest->conn_flags) & + IP_VS_CONN_F_FWD_MASK)) || + nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, + atomic_read(&dest->weight)) || + nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns))) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -2827,7 +3137,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) @@ -2851,6 +3161,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net = skb_sknet(skb); mutex_lock(&__ip_vs_mutex); @@ -2859,7 +3170,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) goto out_err; - svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -2875,7 +3187,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, nla_put_failure: cb->args[0] = idx; - ip_vs_service_put(svc); out_err: mutex_unlock(&__ip_vs_mutex); @@ -2900,8 +3211,10 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, if (!(nla_addr && nla_port)) return -EINVAL; + memset(udest, 0, sizeof(*udest)); + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); + udest->port = nla_get_be16(nla_port); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -2926,8 +3239,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, return 0; } -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid) { struct nlattr *nl_daemon; @@ -2935,10 +3248,10 @@ static 
int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, if (!nl_daemon) return -EMSGSIZE; - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - + if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -2948,12 +3261,12 @@ nla_put_failure: return -EMSGSIZE; } -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid, struct netlink_callback *cb) { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) @@ -2972,56 +3285,61 @@ nla_put_failure: static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { - mutex_lock(&__ip_vs_mutex); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + struct net *net = skb_sknet(skb); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ip_vs_master_mcast_ifn, - ip_vs_master_syncid, cb) < 0) + ipvs->master_mcast_ifn, + ipvs->master_syncid, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } - if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ip_vs_backup_mcast_ifn, - ip_vs_backup_syncid, cb) < 0) + ipvs->backup_mcast_ifn, + ipvs->backup_syncid, cb) < 0) goto nla_put_failure; cb->args[1] = 1; } nla_put_failure: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->sync_mutex); return skb->len; } -static int ip_vs_genl_new_daemon(struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) { if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; - return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + return start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); } -static int ip_vs_genl_del_daemon(struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) { if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + return stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); } -static int ip_vs_genl_set_config(struct nlattr **attrs) +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3033,32 +3351,23 @@ static int ip_vs_genl_set_config(struct nlattr **attrs) if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - return ip_vs_set_timeout(&t); + return ip_vs_set_timeout(net, &t); } -static int ip_vs_genl_set_cmd(struct sk_buff *skb, 
struct genl_info *info) +static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { - struct ip_vs_service *svc = NULL; - struct ip_vs_service_user_kern usvc; - struct ip_vs_dest_user_kern udest; int ret = 0, cmd; - int need_full_svc = 0, need_full_dest = 0; + struct net *net; + struct netns_ipvs *ipvs; + net = skb_sknet(skb); + ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; - mutex_lock(&__ip_vs_mutex); - - if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(); - goto out; - } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(info->attrs); - goto out; - } else if (cmd == IPVS_CMD_NEW_DAEMON || - cmd == IPVS_CMD_DEL_DAEMON) { - + if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + mutex_lock(&ipvs->sync_mutex); if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], @@ -3068,13 +3377,38 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(daemon_attrs); + ret = ip_vs_genl_new_daemon(net, daemon_attrs); else - ret = ip_vs_genl_del_daemon(daemon_attrs); + ret = ip_vs_genl_del_daemon(net, daemon_attrs); +out: + mutex_unlock(&ipvs->sync_mutex); + } + return ret; +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + struct net *net; + + net = skb_sknet(skb); + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(net, false); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(net, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); + ret = ip_vs_zero_all(net); goto out; } @@ -3084,19 +3418,12 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = 1; - ret = ip_vs_genl_parse_service(&usvc, + ret = ip_vs_genl_parse_service(net, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], - need_full_svc); + need_full_svc, &svc); if (ret) goto out; - /* Lookup the exact service by <protocol, addr, port> or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { ret = -ESRCH; @@ -3121,7 +3448,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) - ret = ip_vs_add_service(&usvc, &svc); + ret = ip_vs_add_service(net, &usvc, &svc); else ret = -EEXIST; break; @@ -3130,6 +3457,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) break; case IPVS_CMD_DEL_SERVICE: ret = ip_vs_del_service(svc); + /* do not use svc, it can be freed */ break; case IPVS_CMD_NEW_DEST: ret = ip_vs_add_dest(svc, &udest); @@ -3148,8 +3476,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } out: - if (svc) - ip_vs_service_put(svc); mutex_unlock(&__ip_vs_mutex); return ret; @@ -3160,7 +3486,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff 
*skb, struct genl_info *info) struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; + struct net *net; + net = skb_sknet(skb); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3170,7 +3498,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) else if (cmd == IPVS_CMD_GET_CONFIG) reply_cmd = IPVS_CMD_SET_CONFIG; else { - IP_VS_ERR("unknown Generic Netlink command\n"); + pr_err("unknown Generic Netlink command\n"); return -EINVAL; } @@ -3189,13 +3517,13 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc; - svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + svc = ip_vs_genl_find_service(net, + info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); - ip_vs_service_put(svc); if (ret) goto nla_put_failure; } else { @@ -3210,32 +3538,37 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); #ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, + t.tcp_timeout) || + nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout)) + goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) + goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); + if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, + IP_VS_VERSION_CODE) || + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size)) + goto nla_put_failure; break; } genlmsg_end(msg, reply); - ret = genlmsg_unicast(msg, info->snd_pid); + ret = genlmsg_reply(msg, info); goto out; nla_put_failure: - IP_VS_ERR("not enough space in Netlink message\n"); + pr_err("not enough space in Netlink message\n"); ret = -EMSGSIZE; out_err: @@ -3247,7 +3580,7 @@ out: } -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .flags = GENL_ADMIN_PERM, @@ -3301,13 +3634,13 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = { .cmd = IPVS_CMD_NEW_DAEMON, .flags = GENL_ADMIN_PERM, .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, + .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, .flags = GENL_ADMIN_PERM, .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, + .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, @@ -3345,70 +3678,223 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = { static int __init ip_vs_genl_register(void) { - int ret, i; + return genl_register_family_with_ops(&ip_vs_genl_family, + ip_vs_genl_ops); +} - ret = genl_register_family(&ip_vs_genl_family); - if (ret) - return ret; +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} - for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { - ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); - if (ret) - goto err_out; +/* End of Generic Netlink interface definitions */ + +/* + * per netns intit/exit func. 
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ctl_table *tbl; + + atomic_set(&ipvs->dropentry, 0); + spin_lock_init(&ipvs->dropentry_lock); + spin_lock_init(&ipvs->droppacket_lock); + spin_lock_init(&ipvs->securetcp_lock); + + if (!net_eq(net, &init_net)) { + tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; + } else + tbl = vs_vars; + /* Initialize sysctl defaults */ + idx = 0; + ipvs->sysctl_amemthresh = 1024; + tbl[idx++].data = &ipvs->sysctl_amemthresh; + ipvs->sysctl_am_droprate = 10; + tbl[idx++].data = &ipvs->sysctl_am_droprate; + tbl[idx++].data = &ipvs->sysctl_drop_entry; + tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT + tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif + tbl[idx++].data = &ipvs->sysctl_secure_tcp; + ipvs->sysctl_snat_reroute = 1; + tbl[idx++].data = &ipvs->sysctl_snat_reroute; + ipvs->sysctl_sync_ver = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; + tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; + tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; + tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; + ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; + ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; + tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; + tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + ipvs->sysctl_pmtu_disc = 1; + tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; + + + ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); + if (ipvs->sysctl_hdr == NULL) { + if (!net_eq(net, &init_net)) + kfree(tbl); + return -ENOMEM; } + ip_vs_start_estimator(net, &ipvs->tot_stats); + ipvs->sysctl_tbl = tbl; + /* Schedule defense work */ + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + return 0; +} -err_out: - genl_unregister_family(&ip_vs_genl_family); - return ret; +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } -static void ip_vs_genl_unregister(void) +#else + +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } + +#endif + +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + +int __net_init 
ip_vs_control_net_init(struct net *net) { - genl_unregister_family(&ip_vs_genl_family); + int i, idx; + struct netns_ipvs *ipvs = net_ipvs(net); + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, + (unsigned long) net); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + + /* procfs stats */ + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats.cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ipvs_tot_stats; + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + u64_stats_init(&ipvs_tot_stats->syncp); + } + + spin_lock_init(&ipvs->tot_stats.lock); + + proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, net->proc_net, + &ip_vs_stats_percpu_fops); + + if (ip_vs_control_net_init_sysctl(net)) + goto err; + + return 0; + +err: + free_percpu(ipvs->tot_stats.cpustats); + return -ENOMEM; } -/* End of Generic Netlink interface definitions */ +void __net_exit ip_vs_control_net_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + ip_vs_trash_cleanup(net); + ip_vs_control_net_cleanup_sysctl(net); + remove_proc_entry("ip_vs_stats_percpu", net->proc_net); + remove_proc_entry("ip_vs_stats", net->proc_net); + remove_proc_entry("ip_vs", net->proc_net); + free_percpu(ipvs->tot_stats.cpustats); +} -int __init ip_vs_control_init(void) +int __init ip_vs_register_nl_ioctl(void) { int ret; - int idx; - - EnterFunction(2); ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { - IP_VS_ERR("cannot register sockopt.\n"); - return ret; + pr_err("cannot register sockopt.\n"); + goto err_sock; } ret = ip_vs_genl_register(); if (ret) { - IP_VS_ERR("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - return ret; + pr_err("cannot register Generic Netlink interface.\n"); + goto err_genl; } + return 0; - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: + return ret; +} - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - ip_vs_new_estimator(&ip_vs_stats); + smp_wmb(); /* Do we really need it now ? 
*/ - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; LeaveFunction(2); return 0; @@ -3418,14 +3904,6 @@ int __init ip_vs_control_init(void) void ip_vs_control_cleanup(void) { EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); + unregister_netdevice_notifier(&ip_vs_dst_notifier); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index a9dac74bb13..c3b84546ea9 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -35,7 +35,11 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/ip.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> @@ -47,7 +51,7 @@ * IPVS DH bucket */ struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -60,11 +64,15 @@ struct ip_vs_dh_bucket { #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS DH entry */ -static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -81,10 +89,9 @@ static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } @@ -92,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); p = p->next; } @@ -123,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
*/ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -141,52 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - } - svc->sched_data = tbl; + + svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); return 0; } -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; + struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + ip_vs_dh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; } -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + ip_vs_dh_reassign(s, svc); return 0; } @@ -206,27 +214,26 @@ static inline int is_overloaded(struct ip_vs_dest *dest) * Destination hashing scheduling */ static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + struct ip_vs_dh_state *s; - IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph->daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); @@ -245,7 +252,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), 
.init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, + .add_dest = ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; @@ -259,6 +267,7 @@ static int __init ip_vs_dh_init(void) static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 2eb2860dabb..1425e9a924c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -8,12 +8,19 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: - * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * Affected data: est_list and est_lock. + * estimation_timer() runs with timer per netns. + * get_stats()) do the per cpu summing. */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/kernel.h> #include <linux/jiffies.h> -#include <linux/slab.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> @@ -45,11 +52,44 @@ */ -static void estimation_timer(unsigned long arg); +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, + struct ip_vs_cpu_stats __percpu *stats) +{ + int i; + bool add = false; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); + unsigned int start; + __u64 inbytes, outbytes; + if (add) { + sum->conns += s->ustats.conns; + sum->inpkts += s->ustats.inpkts; + sum->outpkts += s->ustats.outpkts; + do { + start = u64_stats_fetch_begin(&s->syncp); + inbytes = s->ustats.inbytes; + outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->inbytes += inbytes; + sum->outbytes += outbytes; + } else { + add = true; + sum->conns = s->ustats.conns; + sum->inpkts = s->ustats.inpkts; + sum->outpkts = s->ustats.outpkts; + do { + start = u64_stats_fetch_begin(&s->syncp); + sum->inbytes = s->ustats.inbytes; + sum->outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry(&s->syncp, start)); + } + } +} -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); static void estimation_timer(unsigned long arg) { @@ -59,12 +99,16 @@ static void estimation_timer(unsigned long arg) u32 n_inpkts, n_outpkts; u64 n_inbytes, n_outbytes; u32 rate; + struct net *net = (struct net *)arg; + struct netns_ipvs *ipvs; - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { + ipvs = net_ipvs(net); + spin_lock(&ipvs->est_lock); + list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); + ip_vs_read_cpu_stats(&s->ustats, s->cpustats); n_conns = s->ustats.conns; n_inpkts = s->ustats.inpkts; n_outpkts = s->ustats.outpkts; @@ -72,81 +116,64 @@ static void estimation_timer(unsigned long arg) n_outbytes = s->ustats.outbytes; /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; + rate = (n_conns - e->last_conns) << 9; e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->ustats.cps = (e->cps+0x1FF)>>10; + e->cps += ((long)rate - (long)e->cps) >> 2; - rate = (n_inpkts - e->last_inpkts)<<9; + rate = (n_inpkts - e->last_inpkts) << 9; e->last_inpkts = n_inpkts; - e->inpps 
+= ((long)rate - (long)e->inpps)>>2; - s->ustats.inpps = (e->inpps+0x1FF)>>10; + e->inpps += ((long)rate - (long)e->inpps) >> 2; - rate = (n_outpkts - e->last_outpkts)<<9; + rate = (n_outpkts - e->last_outpkts) << 9; e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->ustats.outpps = (e->outpps+0x1FF)>>10; + e->outpps += ((long)rate - (long)e->outpps) >> 2; - rate = (n_inbytes - e->last_inbytes)<<4; + rate = (n_inbytes - e->last_inbytes) << 4; e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->ustats.inbps = (e->inbps+0xF)>>5; + e->inbps += ((long)rate - (long)e->inbps) >> 2; - rate = (n_outbytes - e->last_outbytes)<<4; + rate = (n_outbytes - e->last_outbytes) << 4; e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->ustats.outbps = (e->outbps+0xF)>>5; + e->outbps += ((long)rate - (long)e->outbps) >> 2; spin_unlock(&s->lock); } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); + spin_unlock(&ipvs->est_lock); + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_new_estimator(struct ip_vs_stats *stats) +void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); - est->last_conns = stats->ustats.conns; - est->cps = stats->ustats.cps<<10; - - est->last_inpkts = stats->ustats.inpkts; - est->inpps = stats->ustats.inpps<<10; - - est->last_outpkts = stats->ustats.outpkts; - est->outpps = stats->ustats.outpps<<10; - - est->last_inbytes = stats->ustats.inbytes; - est->inbps = stats->ustats.inbps<<5; - - est->last_outbytes = stats->ustats.outbytes; - est->outbps = stats->ustats.outbps<<5; - - spin_lock_bh(&est_lock); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_kill_estimator(struct ip_vs_stats *stats) +void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; - spin_lock_bh(&est_lock); + spin_lock_bh(&ipvs->est_lock); list_del(&est->list); - spin_unlock_bh(&est_lock); + spin_unlock_bh(&ipvs->est_lock); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; - - /* set counters zero, caller must hold the stats->lock lock */ - est->last_inbytes = 0; - est->last_outbytes = 0; - est->last_conns = 0; - est->last_inpkts = 0; - est->last_outpkts = 0; + struct ip_vs_stats_user *u = &stats->ustats; + + /* reset counters, caller must hold the stats->lock lock */ + est->last_inbytes = u->inbytes; + est->last_outbytes = u->outbytes; + est->last_conns = u->conns; + est->last_inpkts = u->inpkts; + est->last_outpkts = u->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; @@ -154,13 +181,31 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) est->outbps = 0; } -int __init ip_vs_estimator_init(void) +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_stats_user *dst, + struct ip_vs_stats *stats) { - mod_timer(&est_timer, jiffies + 2 * HZ); + struct ip_vs_estimator *e = &stats->est; + + dst->cps = (e->cps + 0x1FF) >> 10; + dst->inpps = (e->inpps + 0x1FF) >> 10; + dst->outpps = (e->outpps + 0x1FF) >> 10; + dst->inbps = (e->inbps + 0xF) >> 5; + dst->outbps = (e->outbps + 0xF) >> 5; +} + +int __net_init ip_vs_estimator_net_init(struct net *net) +{ + struct netns_ipvs *ipvs 
= net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } -void ip_vs_estimator_cleanup(void) +void __net_exit ip_vs_estimator_net_cleanup(struct net *net) { - del_timer_sync(&est_timer); + del_timer_sync(&net_ipvs(net)->est_timer); } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 428edbf481c..77c173282f3 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -22,6 +22,9 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> @@ -29,6 +32,11 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> #include <asm/unaligned.h> @@ -36,16 +44,17 @@ #include <net/ip_vs.h> -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " +#define SERVER_STRING "227 " +#define CLIENT_STRING "PORT" /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. */ +static unsigned int ports_count = 1; static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); +module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); @@ -56,6 +65,8 @@ static int ip_vs_ftp_pasv; static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; return 0; } @@ -69,14 +80,17 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) /* * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. + * with the "pattern", ignoring before "skip" and terminated with + * the "term" character. * <addr,port> is in network order. 
*/ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, + const char *pattern, size_t plen, + char skip, char term, __be32 *addr, __be16 *port, char **start, char **end) { + char *s, c; unsigned char p[6]; int i = 0; @@ -91,19 +105,38 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (strnicmp(data, pattern, plen) != 0) { return 0; } - *start = data + plen; + s = data + plen; + if (skip) { + int found = 0; + + for (;; s++) { + if (s == data_limit) + return -1; + if (!found) { + if (*s == skip) + found = 1; + } else if (*s != skip) { + break; + } + } + } - for (data = *start; *data != term; data++) { + for (data = s; ; data++) { if (data == data_limit) return -1; + if (*data == term) + break; } *end = data; memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { + for (data = s; ; data++) { + c = *data; + if (c == term) + break; + if (c >= '0' && c <= '9') { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { i++; } else { /* unexpected character */ @@ -114,12 +147,12 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (i != 5) return -1; - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); + *start = s; + *addr = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); return 1; } - /* * Look at outgoing ftp packets to catch the response to a PASV command * from the server (inside-to-outside). @@ -144,8 +177,11 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; - int ret; + unsigned int buf_len; + int ret = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct net *net; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -173,7 +209,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', + sizeof(SERVER_STRING)-1, + '(', ')', &from.ip, &port, &start, &end) != 1) return 1; @@ -184,15 +221,22 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Now update or create an connection entry for it */ - n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, - &cp->caddr, 0); + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &cp->caddr, 0, - &cp->vaddr, port, - &from, port, - IP_VS_CONN_F_NO_CPORT, - cp->dest); + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), + AF_INET, IPPROTO_TCP, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); if (!n_cp) return 0; @@ -205,26 +249,48 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ from.ip = n_cp->vaddr.ip; port = n_cp->vport; - sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), - (ntohs(port)>>8)&255, ntohs(port)&255); + snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&from.ip)[0], + ((unsigned char *)&from.ip)[1], + ((unsigned char *)&from.ip)[2], + ((unsigned char *)&from.ip)[3], + ntohs(port) >> 8, + ntohs(port) & 0xFF); 
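The hunk above replaces the old sprintf()/NIPQUAD() call with a bounded snprintf() that prints the four address octets and splits the passive-mode port into its high and low bytes. As a rough user-space illustration of that arithmetic only — format_pasv_tuple() and its variables are invented for this sketch and are not part of the patch:

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    /* Compose the FTP "a1,a2,a3,a4,p1,p2" tuple from a network-order
     * IPv4 address and port, mirroring the snprintf() arithmetic used
     * in ip_vs_ftp_out() above.
     */
    static void format_pasv_tuple(uint32_t addr_be, uint16_t port_be,
                                  char *buf, size_t len)
    {
            const unsigned char *a = (const unsigned char *)&addr_be;
            unsigned int port = ntohs(port_be);

            snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
                     a[0], a[1], a[2], a[3], port >> 8, port & 0xFF);
    }

    int main(void)
    {
            char buf[24];   /* "xxx,xxx,xxx,xxx,ppp,ppp" plus NUL */

            format_pasv_tuple(inet_addr("192.168.0.10"), htons(50123),
                              buf, sizeof(buf));
            printf("227 Entering Passive Mode (%s)\r\n", buf);
            return 0;
    }

With these inputs the sketch prints 192,168,0,10,195,203, i.e. 195*256 + 203 = 50123, which is the same high/low split the helper writes back into the mangled packet.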
+ buf_len = strlen(buf); + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) { + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. + */ + rcu_read_lock(); + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + iph->ihl * 4, + start-data, end-start, + buf, buf_len); + rcu_read_unlock(); + if (ret) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + IPPROTO_TCP, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } + } + /* - * Calculate required delta-offset to keep TCP happy + * Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } + net = skb_net(skb); cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); + ip_vs_tcp_conn_listen(net, n_cp); ip_vs_conn_put(n_cp); return ret; } @@ -253,6 +319,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; + struct net *net; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -306,7 +373,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to.ip, &port, + ' ', '\r', &to.ip, &port, &start, &end) != 1) return 1; @@ -322,27 +389,30 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_proto_name(iph->protocol), &to.ip, ntohs(port), &cp->vaddr.ip, 0); - n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1), - &cp->daddr, htons(ntohs(cp->dport)-1), - 0, - cp->dest); - if (!n_cp) - return 0; + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); + if (!n_cp) + return 0; - /* add its controller */ - ip_vs_control_add(n_cp, cp); + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } } /* * Move tunnel to listen state */ - ip_vs_tcp_conn_listen(n_cp); + net = skb_net(skb); + ip_vs_tcp_conn_listen(net, n_cp); ip_vs_conn_put(n_cp); return 1; @@ -363,42 +433,66 @@ static struct ip_vs_app ip_vs_ftp = { .pkt_in = ip_vs_ftp_in, }; - /* - * ip_vs_ftp initialization + * per netns ip_vs_ftp initialization */ -static int __init ip_vs_ftp_init(void) +static int __net_init __ip_vs_ftp_init(struct net *net) { int i, ret; - struct ip_vs_app *app = &ip_vs_ftp; + struct ip_vs_app *app; + struct netns_ipvs *ipvs = net_ipvs(net); - ret = register_ip_vs_app(app); - if (ret) - return ret; + if (!ipvs) + return -ENOENT; + + app = register_ip_vs_app(net, &ip_vs_ftp); + if (IS_ERR(app)) + return PTR_ERR(app); - for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { + for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; - ret = register_ip_vs_app_inc(app, app->protocol, 
ports[i]); + ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); if (ret) - break; - IP_VS_INFO("%s: loaded support on port[%d] = %d\n", - app->name, i, ports[i]); + goto err_unreg; + pr_info("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); } + return 0; - if (ret) - unregister_ip_vs_app(app); - +err_unreg: + unregister_ip_vs_app(net, &ip_vs_ftp); return ret; } +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + unregister_ip_vs_app(net, &ip_vs_ftp); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; +static int __init ip_vs_ftp_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns on error */ + return rv; +} /* * ip_vs_ftp finish. */ static void __exit ip_vs_ftp_exit(void) { - unregister_ip_vs_app(&ip_vs_ftp); + unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 3eb5e2660c4..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -11,7 +11,7 @@ * Changes: * Martin Hamilton : fixed the terrible locking bugs * *lock(tbl->lock) ==> *lock(&tbl->lock) - * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : fixed the uninitialized tbl->lock bug * Wensong Zhang : added doing full expiration check to * collect stale entries of 24+ hours when * no partial expire check in a half hour @@ -39,7 +39,11 @@ * me to write this module. */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/ip.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> @@ -59,6 +63,8 @@ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) +#define DEFAULT_EXPIRATION (24*60*60*HZ) + /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) @@ -66,7 +72,6 @@ * entries that haven't been touched for a day. 
*/ #define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; /* @@ -85,11 +90,12 @@ static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; * IP address and its destination server */ struct ip_vs_lblc_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest *dest; /* real server (cache) */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -97,48 +103,53 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; /* * IPVS LBLC sysctl table */ - -static ctl_table vs_vars_table[] = { +#ifdef CONFIG_SYSCTL +static struct ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { .ctl_name = 0 } + { } }; +#endif -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +static void ip_vs_lblc_rcu_free(struct rcu_head *head) { - list_del(&en->list); - /* - * We don't kfree dest because it is refered either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); + struct ip_vs_lblc_entry *en = container_of(head, + struct ip_vs_lblc_entry, + rcu_head); + + ip_vs_dest_put_and_free(en->dest); kfree(en); } +static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) +{ + hlist_del_rcu(&en->list); + call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); +} /* * Returns hash value for IPVS LBLC entry */ -static inline unsigned +static inline unsigned int ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -159,25 +170,22 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { - unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ +/* Get ip_vs_lblc_entry associated with supplied parameters. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblc_hashkey(af, addr); + unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -187,7 +195,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. + * address to a server. Called under spin lock. 
*/ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, @@ -196,26 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, struct ip_vs_lblc_entry *en; en = ip_vs_lblc_get(dest->af, tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } + if (en) { + if (en->dest == dest) + return en; + ip_vs_lblc_del(en); + } + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); - en->lastuse = jiffies; + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; - atomic_inc(&dest->refcnt); - en->dest = dest; + ip_vs_dest_hold(dest); + en->dest = dest; - ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } + ip_vs_lblc_hash(tbl, en); return en; } @@ -224,40 +229,56 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +static void ip_vs_lblc_flush(struct ip_vs_service *svc) { - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; int i; - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblc_free(en); + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } } + spin_unlock_bh(&svc->sched_lock); } +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblc_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; unsigned long now = jiffies; int i, j; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) + en->lastuse + + sysctl_lblc_expiration(svc))) continue; - ip_vs_lblc_free(en); + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -281,7 +302,8 @@ static void ip_vs_lblc_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -299,26 +321,26 @@ static void ip_vs_lblc_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); 
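Both the flush and the periodic expiry above walk the per-service hash table under sched_lock, and ip_vs_lblc_check_expire() resumes from a saved rover bucket so each pass reclaims only a bounded number of idle entries. A minimal user-space sketch of that rover scheme — the types and names below are invented for the example and use a plain singly linked list instead of the kernel's hlist/RCU primitives:

    #include <stdlib.h>
    #include <time.h>

    #define TAB_SIZE  32
    #define TAB_MASK  (TAB_SIZE - 1)
    #define TIMEOUT   (6 * 60)              /* seconds, like ENTRY_TIMEOUT */

    struct entry {
            struct entry *next;
            time_t lastuse;
    };

    struct table {
            struct entry *bucket[TAB_SIZE];
            int rover;                      /* where the last pass stopped */
    };

    /* Expire up to "goal" idle entries, resuming from tbl->rover. */
    static void partial_expire(struct table *tbl, int goal)
    {
            time_t now = time(NULL);
            int i, j = tbl->rover;

            for (i = 0; i < TAB_SIZE && goal > 0; i++) {
                    struct entry **pp;

                    j = (j + 1) & TAB_MASK;
                    for (pp = &tbl->bucket[j]; *pp && goal > 0; ) {
                            struct entry *en = *pp;

                            if (now - en->lastuse < TIMEOUT) {
                                    pp = &en->next;     /* still fresh */
                                    continue;
                            }
                            *pp = en->next;             /* unlink and free */
                            free(en);
                            goal--;
                    }
            }
            tbl->rover = j;                 /* next pass continues from here */
    }

Saving the rover means successive timer ticks spread the cost of expiration across the table instead of rescanning every bucket on each run.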
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; - ip_vs_lblc_free(en); + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } tbl->rover = j; out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); } @@ -330,11 +352,10 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) return -ENOMEM; - } + svc->sched_data = tbl; IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(*tbl)); @@ -342,12 +363,13 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -360,7 +382,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; @@ -368,14 +390,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); + ip_vs_lblc_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -386,12 +406,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) int loh, doh; /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: + * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! @@ -402,13 +417,12 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -418,14 +432,13 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * Find the destination with the least load. 
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -453,7 +466,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -468,20 +481,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_iphdr iph; struct ip_vs_dest *dest = NULL; struct ip_vs_lblc_entry *en; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); + en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); if (en) { /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; @@ -495,30 +505,28 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * free up entries from the trash at any time. */ - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; + dest = en->dest; + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); if (!dest) { - IP_VS_ERR_RL("LBLC: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); return dest; @@ -528,8 +536,7 @@ out: /* * IPVS LBLC Scheduler structure */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, @@ -539,23 +546,85 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .schedule = ip_vs_lblc_schedule, }; +/* + * per netns init. 
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblc_ctl_table[0].procname = NULL; + + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; static int __init ip_vs_lblc_init(void) { int ret; - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) - unregister_sysctl_table(sysctl_header); + unregister_pernet_subsys(&ip_vs_lblc_ops); return ret; } - static void __exit ip_vs_lblc_cleanup(void) { - unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); + rcu_barrier(); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index c04ce56c7f0..3f21a2f47de 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -37,11 +37,16 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> +#include <linux/list.h> +#include <linux/slab.h> /* for sysctl */ #include <linux/fs.h> @@ -58,6 +63,8 @@ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) +#define DEFAULT_EXPIRATION (24*60*60*HZ) + /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) @@ -65,8 +72,6 @@ * entries that haven't been touched for a day. 
*/ #define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - /* * for IPVS lblcr entry hash table @@ -82,104 +87,96 @@ static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; /* * IPVS destination set structure and operations */ -struct ip_vs_dest_list { - struct ip_vs_dest_list *next; /* list link */ - struct ip_vs_dest *dest; /* destination server */ +struct ip_vs_dest_set_elem { + struct list_head list; /* list link */ + struct ip_vs_dest *dest; /* destination server */ + struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ - struct ip_vs_dest_list *list; /* destination list */ - rwlock_t lock; /* lock for this list */ + struct list_head list; /* destination list */ }; -static struct ip_vs_dest_list * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) { - struct ip_vs_dest_list *e; + struct ip_vs_dest_set_elem *e; - for (e=set->list; e!=NULL; e=e->next) { - if (e->dest == dest) - /* already existed */ - return NULL; + if (check) { + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + return; + } } e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (e == NULL) { - IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); - return NULL; - } + if (e == NULL) + return; - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); e->dest = dest; - /* link it to the list */ - e->next = set->list; - set->list = e; + list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; - return e; +} + +static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_set_elem *e; + + e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); + ip_vs_dest_put_and_free(e->dest); + kfree(e); } static void ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) { - struct ip_vs_dest_list *e, **ep; + struct ip_vs_dest_set_elem *e; - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + list_for_each_entry(e, &set->list, list) { if (e->dest == dest) { /* HIT */ - *ep = e->next; atomic_dec(&set->size); set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - kfree(e); + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); break; } - ep = &e->next; } } static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - *ep = e->next; - /* - * We don't kfree dest because it is refered either - * by its service or by the trash dest list. 
- */ - atomic_dec(&e->dest->refcnt); - kfree(e); + struct ip_vs_dest_set_elem *e, *ep; + + list_for_each_entry_safe(e, ep, &set->list, list) { + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); } - write_unlock(&set->lock); } /* get weighted least-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) { - register struct ip_vs_dest_list *e; + register struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *least; int loh, doh; - if (set == NULL) - return NULL; - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { + list_for_each_entry_rcu(e, &set->list, list) { least = e->dest; if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; if ((atomic_read(&least->weight) > 0) && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -187,23 +184,23 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* find the destination with the weighted least load */ nextstage: - for (e=e->next; e!=NULL; e=e->next) { + list_for_each_entry_continue_rcu(e, &set->list, list) { dest = e->dest; if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) + doh = ip_vs_dest_conn_overhead(dest); + if (((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { least = dest; loh = doh; } } - IP_VS_DBG_BUF(6, "ip_vs_dest_set_min: server %s:%d " + IP_VS_DBG_BUF(6, "%s(): server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), @@ -216,7 +213,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* get weighted most-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) { - register struct ip_vs_dest_list *e; + register struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *most; int moh, doh; @@ -224,11 +221,10 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) return NULL; /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { + list_for_each_entry(e, &set->list, list) { most = e->dest; if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); + moh = ip_vs_dest_conn_overhead(most); goto nextstage; } } @@ -236,21 +232,21 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* find the destination with the weighted most load */ nextstage: - for (e=e->next; e!=NULL; e=e->next) { + list_for_each_entry_continue(e, &set->list, list) { dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); + doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) + if (((__s64)moh * atomic_read(&dest->weight) < + (__s64)doh * atomic_read(&most->weight)) && (atomic_read(&dest->weight) > 0)) { most = dest; moh = doh; } } - IP_VS_DBG_BUF(6, "ip_vs_dest_set_max: server %s:%d " + 
IP_VS_DBG_BUF(6, "%s(): server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), atomic_read(&most->activeconns), atomic_read(&most->refcnt), @@ -264,11 +260,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) * IP address and its destination server set */ struct ip_vs_lblcr_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -276,44 +273,46 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; +#ifdef CONFIG_SYSCTL /* * IPVS LBLCR sysctl table */ -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { .ctl_name = 0 } + { } }; - -static struct ctl_table_header * sysctl_header; +#endif static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { - list_del(&en->list); + hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); - kfree(en); + kfree_rcu(en, rcu_head); } /* * Returns hash value for IPVS LBLCR entry */ -static inline unsigned +static inline unsigned int ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -334,25 +333,22 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { - unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ +/* Get ip_vs_lblcr_entry associated with supplied parameters. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblcr_hashkey(af, addr); + unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -362,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. + * IP address to a server. Called under spin lock. 
*/ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, @@ -373,26 +369,24 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, en = ip_vs_lblcr_get(dest->af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + if (!en) return NULL; - } en->af = dest->af; ip_vs_addr_copy(dest->af, &en->addr, daddr); en->lastuse = jiffies; - /* initilize its dest set */ + /* initialize its dest set */ atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); + INIT_LIST_HEAD(&en->set.list); + + ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); + return en; } - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest, true); return en; } @@ -401,40 +395,54 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - /* No locking required, only called during cleanup. */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } + spin_unlock_bh(&svc->sched_lock); } +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblcr_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_after(en->lastuse + + sysctl_lblcr_expiration(svc), now)) continue; ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -458,7 +466,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -476,11 +485,11 @@ static void ip_vs_lblcr_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + 
spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -488,7 +497,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -506,11 +515,10 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) return -ENOMEM; - } + svc->sched_data = tbl; IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(*tbl)); @@ -518,12 +526,13 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -536,7 +545,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; @@ -544,14 +553,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); + ip_vs_lblcr_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -562,12 +569,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) int loh, doh; /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: + * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! @@ -578,14 +580,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -595,14 +596,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * Find the destination with the least load. 
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -630,7 +630,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -645,65 +645,56 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_lblcr_table *tbl = svc->sched_data; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest = NULL; + struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); + en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); if (en) { - /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* Get the least loaded destination */ - read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + - sysctl_ip_vs_lblcr_expiration)) { - struct ip_vs_dest *m; - - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); + time_after(jiffies, en->set.lastmod + + sysctl_lblcr_expiration(svc))) { + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; + + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); + if (dest && !is_overloaded(dest, svc)) goto out; - } /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { - IP_VS_ERR_RL("LBLCR: no destination available\n"); - read_unlock(&svc->sched_lock); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); goto out; + } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); @@ -713,13 +704,14 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } 
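The hunks above replace the old activeconns*50 + inactconns formula with ip_vs_dest_conn_overhead() and add (__s64) casts to the weighted comparison. As a minimal userspace sketch (the names dest_sample and less_loaded are hypothetical, not from the patch): since floats are unavailable in kernel mode, the ratio test loh/lw > doh/dw is evaluated by cross-multiplication as loh*dw > doh*lw for lw, dw > 0, and promoting one operand to 64 bits keeps the 32-bit products from overflowing when overhead and weight are both large.

/* Illustrative sketch only -- mirrors the comparison used in
 * __ip_vs_lblcr_schedule(), compiled as plain userspace C. */
#include <stdint.h>
#include <stdbool.h>

struct dest_sample {
	int32_t overhead;	/* e.g. result of ip_vs_dest_conn_overhead() */
	int32_t weight;		/* configured server weight, assumed > 0 */
};

/* True when candidate 'b' carries less weighted load than current 'a':
 * a->overhead / a->weight > b->overhead / b->weight, without division. */
static bool less_loaded(const struct dest_sample *a,
			const struct dest_sample *b)
{
	return (int64_t)a->overhead * b->weight >
	       (int64_t)b->overhead * a->weight;
}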
/* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); return dest; @@ -740,23 +732,84 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .schedule = ip_vs_lblcr_schedule, }; +/* + * per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblcr_ctl_table[0].procname = NULL; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; static int __init ip_vs_lblcr_init(void) { int ret; - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) - unregister_sysctl_table(sysctl_header); + unregister_pernet_subsys(&ip_vs_lblcr_ops); return ret; } - static void __exit ip_vs_lblcr_cleanup(void) { - unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); + rcu_barrier(); } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index d0dadc8a65f..2bdcb1cf212 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -14,37 +14,25 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. 
(This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - /* * Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; unsigned int loh = 0, doh; - IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* * Simply select the server with the least number of @@ -55,11 +43,11 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * served, but no new connection is assigned to the server. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->weight) == 0) continue; - doh = ip_vs_lc_dest_overhead(dest); + doh = ip_vs_dest_conn_overhead(dest); if (!least || doh < loh) { least = dest; loh = doh; @@ -67,7 +55,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (!least) - IP_VS_ERR_RL("LC: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); else IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " "inactconns %d\n", @@ -97,6 +85,7 @@ static int __init ip_vs_lc_init(void) static void __exit ip_vs_lc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); } module_init(ip_vs_lc_init); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 00000000000..5882bbfd198 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,299 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * + * Authors: + * Ben North <ben@redfrontdoor.org> + * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels + * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match + * + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... 
iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/ip_vs.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> + + +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* Applications may adjust TCP seqs */ + if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && + !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. 
+ */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(&p); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? 
&cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 694952db502..961a6de9bb2 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -31,13 +31,16 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> -static inline unsigned int +static inline int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { /* @@ -52,12 +55,13 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; + int loh = 0, doh; - IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* * We calculate the load of each dest server as follows: @@ -72,7 +76,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. 
*/ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD || !atomic_read(&dest->weight)) @@ -88,15 +92,15 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { + ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight))) { least = dest; loh = doh; } } if (!least) { - IP_VS_ERR_RL("NQ: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } @@ -130,6 +134,7 @@ static int __init ip_vs_nq_init(void) static void __exit ip_vs_nq_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); } module_init(ip_vs_nq_init); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 00000000000..1a82b29ce8e --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,111 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <asm/string.h> +#include <linux/kmod.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* semaphore for IPVS PEs. */ +static DEFINE_MUTEX(ip_vs_pe_mutex); + +/* Get pe in the pe list by name */ +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if (pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + rcu_read_unlock(); + return pe; + } + if (pe->module) + module_put(pe->module); + } + rcu_read_unlock(); + + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe by name */ + pe = __ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = __ip_vs_pe_getbyname(name); + } + + return pe; +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&ip_vs_pe_mutex); + /* Make sure that the pe with this name doesn't exist + * in the pe list. 
+ */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + mutex_unlock(&ip_vs_pe_mutex); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + mutex_lock(&ip_vs_pe_mutex); + /* Remove it from the d-linked pe list */ + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c new file mode 100644 index 00000000000..bed5f704252 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -0,0 +1,171 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +#ifdef CONFIG_IP_VS_DEBUG +static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, + const char *callid, size_t callid_len, + int *idx) +{ + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); + memcpy(buf + *idx, callid, len); + buf[*idx+len] = '\0'; + *idx += len + 1; + return buf + *idx - len; +} + +#define IP_VS_DEBUG_CALLID(callid, len) \ + ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ + callid, len, &ip_vs_dbg_idx) +#endif + +static int get_callid(const char *dptr, unsigned int dataoff, + unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen) +{ + /* Find callid */ + while (1) { + int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen, + SIP_HDR_CALL_ID, matchoff, + matchlen); + if (ret > 0) + break; + if (!ret) + return -EINVAL; + dataoff += *matchoff; + } + + /* Too large is useless */ + if (*matchlen > IP_VS_PEDATA_MAXLEN) + return -EINVAL; + + /* SIP headers are always followed by a line terminator */ + if (*matchoff + *matchlen == datalen) + return -EINVAL; + + /* RFC 2543 allows lines to be terminated with CR, LF or CRLF, + * RFC 3261 allows only CRLF, we support both. */ + if (*(dptr + *matchoff + *matchlen) != '\r' && + *(dptr + *matchoff + *matchlen) != '\n') + return -EINVAL; + + IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n", + IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen), + *matchlen); + return 0; +} + +static int +ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) +{ + struct ip_vs_iphdr iph; + unsigned int dataoff, datalen, matchoff, matchlen; + const char *dptr; + int retc; + + ip_vs_fill_iph_skb(p->af, skb, &iph); + + /* Only useful with UDP */ + if (iph.protocol != IPPROTO_UDP) + return -EINVAL; + /* todo: IPv6 fragments: + * I think this only should be done for the first fragment. 
/HS + */ + dataoff = iph.len + sizeof(struct udphdr); + + if (dataoff >= skb->len) + return -EINVAL; + retc = skb_linearize(skb); + if (retc < 0) + return retc; + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + + if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) + return -EINVAL; + + /* N.B: pe_data is only set on success, + * this allows fallback to the default persistence logic on failure + */ + p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + + p->pe_data_len = matchlen; + + return 0; +} + +static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct) + +{ + bool ret = false; + + if (ct->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && + /* protocol should only be IPPROTO_IP if + * d_addr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + p->vaddr, &ct->vaddr) && + ct->vport == p->vport && + ct->flags & IP_VS_CONN_F_TEMPLATE && + ct->protocol == p->protocol && + ct->pe_data && ct->pe_data_len == p->pe_data_len && + !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) + ret = true; + + IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, + u32 initval, bool inverse) +{ + return jhash(p->pe_data, p->pe_data_len, initval); +} + +static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) +{ + memcpy(buf, cp->pe_data, cp->pe_data_len); + return cp->pe_data_len; +} + +static struct ip_vs_pe ip_vs_sip_pe = +{ + .name = "sip", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list), + .fill_param = ip_vs_sip_fill_param, + .ct_match = ip_vs_sip_ct_match, + .hashkey_raw = ip_vs_sip_hashkey_raw, + .show_pe_data = ip_vs_sip_show_pe_data, +}; + +static int __init ip_vs_sip_init(void) +{ + return register_ip_vs_pe(&ip_vs_sip_pe); +} + +static void __exit ip_vs_sip_cleanup(void) +{ + unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); +} + +module_init(ip_vs_sip_init); +module_exit(ip_vs_sip_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index a01520e3d6b..939f7fbe9b4 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -13,15 +13,18 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/gfp.h> #include <linux/in.h> #include <linux/ip.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> -#include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> @@ -45,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; @@ -56,6 +59,37 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } +/* + * register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned 
int hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); + + if (!pd) + return -ENOMEM; + + pd->pp = pp; /* For speed issues */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink an free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } + + return 0; +} /* * unregister an ipvs protocol @@ -63,7 +97,7 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { @@ -78,6 +112,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) return -ESRCH; } +/* + * unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data **pd_p; + unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(net, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} /* * get ip_vs_protocol object by its proto. @@ -85,7 +142,7 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) @@ -94,20 +151,46 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) return NULL; } +EXPORT_SYMBOL(ip_vs_proto_get); + +/* + * get ip_vs_protocol object data by netns and proto + */ +static struct ip_vs_proto_data * +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ + struct ip_vs_proto_data *pd; + unsigned int hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + return __ipvs_proto_data_get(ipvs, proto); +} +EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ -void ip_vs_protocol_timeout_change(int flags) +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { - struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); } } } @@ -116,7 +199,7 @@ void ip_vs_protocol_timeout_change(int flags) int * ip_vs_create_timeout_table(int *table, int size) { - return kmemdup(table, size, GFP_ATOMIC); + return kmemdup(table, size, GFP_KERNEL); } @@ -124,7 +207,8 @@ ip_vs_create_timeout_table(int *table, int size) * Set timeout value for state 
specified by name */ int -ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +ip_vs_set_state_timeout(int *table, int num, const char *const *names, + const char *name, int to) { int i; @@ -162,26 +246,24 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); + sprintf(buf, "TRUNCATED"); else if (ih->frag_off & htons(IP_OFFSET)) - sprintf(buf, "%s %pI4->%pI4 frag", - pp->name, &ih->saddr, &ih->daddr); + sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { - __be16 _ports[2], *pptr -; + __be16 _ports[2], *pptr; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %pI4->%pI4", - pp->name, &ih->saddr, &ih->daddr); + sprintf(buf, "TRUNCATED %pI4->%pI4", + &ih->saddr, &ih->daddr); else - sprintf(buf, "%s %pI4:%u->%pI4:%u", - pp->name, + sprintf(buf, "%pI4:%u->%pI4:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); + pr_debug("%s: %s %s\n", msg, pp->name, buf); } #ifdef CONFIG_IP_VS_IPV6 @@ -196,44 +278,90 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); + sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) - sprintf(buf, "%s %pI6->%pI6 frag", - pp->name, &ih->saddr, &ih->daddr); + sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %pI6->%pI6", - pp->name, &ih->saddr, &ih->daddr); + sprintf(buf, "TRUNCATED %pI6c->%pI6c", + &ih->saddr, &ih->daddr); else - sprintf(buf, "%s %pI6:%u->%pI6:%u", - pp->name, + sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); + pr_debug("%s: %s %s\n", msg, pp->name, buf); } #endif void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) + if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } +/* + * per network name-space init + */ +int __net_init ip_vs_protocol_net_init(struct net *net) +{ + int i, ret; + static struct ip_vs_protocol *protos[] = { +#ifdef CONFIG_IP_VS_PROTO_TCP + &ip_vs_protocol_tcp, +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + &ip_vs_protocol_udp, +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + &ip_vs_protocol_sctp, +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + &ip_vs_protocol_ah, +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + &ip_vs_protocol_esp, +#endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } + return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; +} + +void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(net, pd); + } 
+} int __init ip_vs_protocol_init(void) { @@ -253,13 +381,16 @@ int __init ip_vs_protocol_init(void) #ifdef CONFIG_IP_VS_PROTO_UDP REGISTER_PROTOCOL(&ip_vs_protocol_udp); #endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + REGISTER_PROTOCOL(&ip_vs_protocol_sctp); +#endif #ifdef CONFIG_IP_VS_PROTO_AH REGISTER_PROTOCOL(&ip_vs_protocol_ah); #endif #ifdef CONFIG_IP_VS_PROTO_ESP REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif - IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + pr_info("Registered protocols (%s)\n", &protocols[2]); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 79f56c1e7c1..5de3dd312c0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -10,6 +10,9 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/in.h> #include <linux/ip.h> #include <linux/module.h> @@ -37,28 +40,32 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 +static void +ah_esp_conn_fill_param_proto(struct net *net, int af, + const struct ip_vs_iphdr *iph, int inverse, + struct ip_vs_conn_param *p) +{ + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, +ah_esp_conn_in_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_in_get(&p); if (!cp) { /* * We are not sure if the packet is from our @@ -67,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", inverse ? "ICMP+" : "", - pp->name, + ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } @@ -78,32 +85,19 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, static struct ip_vs_conn * ah_esp_conn_out_get(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", inverse ? 
"ICMP+" : "", - pp->name, + ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } @@ -113,8 +107,9 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { /* * AH/ESP is only related traffic. Pass the packet to IP stack. @@ -123,76 +118,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %pI4->%pI4", - pp->name, &ih->saddr, &ih->daddr); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %pI6->%pI6", - pp->name, &ih->saddr, &ih->daddr); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - #ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, + .init = NULL, + .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, @@ -203,9 +136,8 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, }; #endif @@ -215,8 +147,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .protocol = IPPROTO_ESP, .num_states = 1, .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, + .init = NULL, + .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, @@ -227,7 +159,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c new file mode 100644 index 00000000000..2f7ea756404 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -0,0 +1,591 @@ +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include 
<net/ip.h> +#include <net/ip6_checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/sctp/checksum.h> +#include <net/ip_vs.h> + +static int +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct net *net; + struct ip_vs_service *svc; + struct netns_ipvs *ipvs; + sctp_chunkhdr_t _schunkh, *sch; + sctp_sctphdr_t *sh, _sctph; + + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh == NULL) { + *verdict = NF_DROP; + return 0; + } + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch == NULL) { + *verdict = NF_DROP; + return 0; + } + + net = skb_net(skb); + ipvs = net_ipvs(net); + rcu_read_lock(); + if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, sh->dest))) { + int ignored; + + if (ip_vs_todrop(ipvs)) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + rcu_read_unlock(); + *verdict = NF_DROP; + return 0; + } + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + /* NF_ACCEPT */ + return 1; +} + +static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph, + unsigned int sctphoff) +{ + sctph->checksum = sctp_compute_cksum(skb, sctphoff); + skb->ip_summed = CHECKSUM_UNNECESSARY; +} + +static int +sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff = iph->len; + bool payload_csum = false; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + ret = ip_vs_app_pkt_out(cp, skb); + if (ret == 0) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + + /* Only update csum if we really have to */ + if (sctph->source != cp->vport || payload_csum || + skb->ip_summed == CHECKSUM_PARTIAL) { + sctph->source = cp->vport; + sctp_nat_csum(skb, sctph, sctphoff); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + return 1; +} + +static int +sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff = iph->len; + bool payload_csum = false; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + ret = ip_vs_app_pkt_in(cp, skb); + 
if (ret == 0) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + + /* Only update csum if we really have to */ + if (sctph->dest != cp->dport || payload_csum || + (skb->ip_summed == CHECKSUM_PARTIAL && + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { + sctph->dest = cp->dport; + sctp_nat_csum(skb, sctph, sctphoff); + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + return 1; +} + +static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int sctphoff; + struct sctphdr *sh, _sctph; + __le32 cmp, val; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + cmp = sh->checksum; + val = sctp_compute_cksum(skb, sctphoff); + + if (val != cmp) { + /* CRC failure, dump it. */ + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + return 1; +} + +enum ipvs_sctp_event_t { + IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ + IP_VS_SCTP_INIT, + IP_VS_SCTP_INIT_ACK, + IP_VS_SCTP_COOKIE_ECHO, + IP_VS_SCTP_COOKIE_ACK, + IP_VS_SCTP_SHUTDOWN, + IP_VS_SCTP_SHUTDOWN_ACK, + IP_VS_SCTP_SHUTDOWN_COMPLETE, + IP_VS_SCTP_ERROR, + IP_VS_SCTP_ABORT, + IP_VS_SCTP_EVENT_LAST +}; + +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { + [SCTP_CID_DATA] = IP_VS_SCTP_DATA, + [SCTP_CID_INIT] = IP_VS_SCTP_INIT, + [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, + [SCTP_CID_SACK] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, + [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, + [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, + [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, + [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, + [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, + [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, + [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, + [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, + [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, +}; + +/* SCTP States: + * See RFC 2960, 4. SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. 
+ * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ + +#define sNO IP_VS_SCTP_S_NONE +#define sI1 IP_VS_SCTP_S_INIT1 +#define sIN IP_VS_SCTP_S_INIT +#define sCS IP_VS_SCTP_S_COOKIE_SENT +#define sCR IP_VS_SCTP_S_COOKIE_REPLIED +#define sCW IP_VS_SCTP_S_COOKIE_WAIT +#define sCO IP_VS_SCTP_S_COOKIE +#define sCE IP_VS_SCTP_S_COOKIE_ECHOED +#define sES IP_VS_SCTP_S_ESTABLISHED +#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT +#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ IP_VS_SCTP_S_REJECTED +#define sCL IP_VS_SCTP_S_CLOSED + +static const __u8 sctp_states + [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { + { /* INPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* OUTPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, +/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, +/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* INPUT-ONLY */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s 
*/{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, +}; + +#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) + +/* Timeout table[state] */ +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_LAST] = 2 * HZ, +}; + +static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT1] = "INIT1", + [IP_VS_SCTP_S_INIT] = "INIT", + [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", + [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", + [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", + [IP_VS_SCTP_S_COOKIE] = "COOKIE", + [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", + [IP_VS_SCTP_S_REJECTED] = "REJECTED", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + [IP_VS_SCTP_S_LAST] = "BUG!", +}; + + +static const char *sctp_state_name(int state) +{ + if (state >= IP_VS_SCTP_S_LAST) + return "ERR!"; + if (sctp_state_name_table[state]) + return sctp_state_name_table[state]; + return "?"; +} + +static inline void +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, const struct sk_buff *skb) +{ + sctp_chunkhdr_t _sctpch, *sch; + unsigned char chunk_type; + int event, next_state; + int ihl, cofs; + +#ifdef CONFIG_IP_VS_IPV6 + ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + ihl = ip_hdrlen(skb); +#endif + + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); + if (sch == NULL) + return; + + chunk_type = sch->type; + /* + * Section 3: Multiple chunks can be bundled into one SCTP packet + * up to the MTU size, except for the INIT, INIT ACK, and + * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with + * any other chunk in a packet. + * + * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control + * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be + * bundled with an ABORT, but they MUST be placed before the ABORT + * in the SCTP packet or they will be ignored by the receiver. 
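The three-dimensional table above is indexed as sctp_states[direction][event][current state]: the packet direction selects one of the three blocks (INPUT, OUTPUT, INPUT-ONLY), the event derived from the SCTP chunk type selects a row, and the current connection state selects the column holding the next state. A reduced user-space sketch of the same indexing pattern follows; the toy enum names and transition values are invented for illustration and are not the kernel's tables.

#include <stdio.h>

/* Toy version of: next_state = table[direction][event][state];
 * the real tables use IP_VS_DIR_*, IP_VS_SCTP_EVENT_* and IP_VS_SCTP_S_*.
 */
enum dir   { DIR_IN, DIR_OUT, DIR_LAST };
enum event { EV_INIT, EV_COOKIE_ACK, EV_LAST };
enum state { S_NONE, S_INIT, S_ESTABLISHED, S_LAST };

static const unsigned char toy_states[DIR_LAST][EV_LAST][S_LAST] = {
	[DIR_IN] = {
		[EV_INIT]       = { S_INIT, S_INIT, S_ESTABLISHED },
		[EV_COOKIE_ACK] = { S_NONE, S_INIT, S_ESTABLISHED },
	},
	[DIR_OUT] = {
		[EV_INIT]       = { S_NONE, S_INIT, S_ESTABLISHED },
		[EV_COOKIE_ACK] = { S_ESTABLISHED, S_ESTABLISHED, S_ESTABLISHED },
	},
};

int main(void)
{
	int state = S_NONE;

	state = toy_states[DIR_IN][EV_INIT][state];        /* client INIT  */
	state = toy_states[DIR_OUT][EV_COOKIE_ACK][state]; /* server C-ACK */
	printf("final state %d (2 == established)\n", state);
	return 0;
}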
+ */ + if ((sch->type == SCTP_CID_COOKIE_ECHO) || + (sch->type == SCTP_CID_COOKIE_ACK)) { + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) + chunk_type = sch->type; + } + } + + event = (chunk_type < sizeof(sctp_events)) ? + sctp_events[chunk_type] : IP_VS_SCTP_DATA; + + /* Update direction to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (direction == IP_VS_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + direction = IP_VS_DIR_INPUT_ONLY; + } + + next_state = sctp_states[direction][event][cp->state]; + + if (next_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((direction == IP_VS_DIR_OUTPUT) ? + "output " : "input "), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + sctp_state_name(cp->state), + sctp_state_name(next_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state == IP_VS_SCTP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? */ + cp->timeout = sctp_timeouts[cp->state = next_state]; +} + +static void +sctp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, struct ip_vs_proto_data *pd) +{ + spin_lock_bh(&cp->lock); + set_sctp_state(pd, cp, direction, skb); + spin_unlock_bh(&cp->lock); +} + +static inline __u16 sctp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) + & SCTP_APP_TAB_MASK; +} + +static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + hash = sctp_app_hashkey(port); + + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->appcnt); +out: + + return ret; +} + +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); +} + +static int sctp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + /* Lookup application incarnations and bind the right one */ + hash = sctp_app_hashkey(cp->vport); + + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to 
app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + rcu_read_unlock(); +out: + return result; +} + +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; +} + +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + +struct ip_vs_protocol ip_vs_protocol_sctp = { + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, + .unregister_app = sctp_unregister_app, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .csum_check = sctp_csum_check, + .state_name = sctp_state_name, + .state_transition = sctp_state_transition, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 8cba4180285..e3a697234a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -9,10 +9,17 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * + * Network name space (netns) aware. 
+ * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ @@ -24,77 +31,36 @@ #include <net/ip_vs.h> - -static struct ip_vs_conn * -tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - - static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { + struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + struct netns_ipvs *ipvs; - th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } - - if (th->syn && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, - th->dest))) { - if (ip_vs_todrop()) { + net = skb_net(skb); + ipvs = net_ipvs(net); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + rcu_read_lock(); + if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, th->dest))) { + int ignored; + + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -103,13 +69,18 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
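The test near the top of tcp_conn_schedule() above — (th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst — decides whether a packet may create a new connection entry; with the sloppy setting enabled, non-SYN packets qualify as well (helpful, for example, when established connections reach a director that has no state for them). A minimal stand-alone restatement of that predicate, with hypothetical flag combinations:

#include <stdbool.h>
#include <stdio.h>

/* May this packet create a new IPVS connection entry? */
static bool may_schedule(bool syn, bool rst, bool sloppy_tcp)
{
	return (syn || sloppy_tcp) && !rst;
}

int main(void)
{
	printf("SYN, strict: %d\n", may_schedule(true,  false, false));
	printf("ACK, strict: %d\n", may_schedule(false, false, false));
	printf("ACK, sloppy: %d\n", may_schedule(false, false, true));
	printf("RST, sloppy: %d\n", may_schedule(false, true,  true));
	return 0;
}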
*/ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); + /* NF_ACCEPT */ return 1; } @@ -144,32 +115,31 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); + csum_unfold(tcph->check)))); else #endif tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); + csum_unfold(tcph->check)))); } static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; - unsigned int tcphoff; + unsigned int tcphoff = iph->len; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - tcphoff = ip_hdrlen(skb); oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ @@ -177,13 +147,20 @@ tcp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -194,12 +171,13 @@ tcp_snat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; @@ -217,6 +195,7 @@ tcp_snat_handler(struct sk_buff *skb, skb->len - tcphoff, cp->protocol, skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, @@ -227,19 +206,18 @@ tcp_snat_handler(struct sk_buff *skb, static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; - unsigned int tcphoff; + unsigned int tcphoff = iph->len; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - tcphoff = ip_hdrlen(skb); oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ @@ -247,6 +225,8 @@ tcp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -255,8 +235,13 @@ tcp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -266,15 +251,16 @@ tcp_dnat_handler(struct sk_buff *skb, * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; @@ -321,7 +307,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -332,7 +318,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -359,7 +345,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = { /* * Timeout table[state] */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, @@ -374,7 +360,7 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = 2*HZ, }; -static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { +static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", @@ -417,7 +403,7 @@ static struct tcp_states_t tcp_states [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ @@ -431,7 +417,7 @@ static struct tcp_states_t tcp_states [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; @@ -440,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ @@ -454,14 +440,11 @@ static struct tcp_states_t tcp_states_dos [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ @@ -471,14 +454,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) ** for most if not for all of the applications. 
Something ** like "capabilities" (flags) for each object. */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); + pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) @@ -495,7 +471,7 @@ static inline int tcp_state_idx(struct tcphdr *th) } static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; @@ -518,7 +494,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, goto tcp_state_out; } - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { @@ -526,7 +503,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, + pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', @@ -556,17 +533,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } - cp->timeout = pp->timeout_table[cp->state = new_state]; + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; } - /* * Handle state transitions */ -static int +static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; @@ -578,26 +557,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) - return 0; - - spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); - spin_unlock(&cp->lock); + return; - return 1; + spin_lock_bh(&cp->lock); + set_tcp_state(pd, cp, direction, th); + spin_unlock_bh(&cp->lock); } - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) @@ -605,44 +571,45 @@ static inline __u16 tcp_app_hashkey(__be16 port) } -static int tcp_register_app(struct ip_vs_app *inc) +static int tcp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); hash = tcp_app_hashkey(port); - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { + list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); + list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); + atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&tcp_app_lock); return ret; } static void -tcp_unregister_app(struct ip_vs_app *inc) +tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - 
spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; @@ -654,14 +621,14 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&tcp_app_lock); + rcu_read_unlock(); - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), @@ -676,7 +643,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&tcp_app_lock); + rcu_read_unlock(); out: return result; @@ -686,24 +653,37 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { - spin_lock(&cp->lock); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; - spin_unlock(&cp->lock); + cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] + : tcp_timeouts[IP_VS_TCP_S_LISTEN]); + spin_unlock_bh(&cp->lock); } - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + * timeouts is netns related now. 
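In __ip_vs_tcp_init() below, each network namespace gets its own writable copy of the const tcp_timeouts template via ip_vs_create_timeout_table() (treated here as a duplicate-the-template helper; its implementation is not part of this hunk), so timeouts can be tuned per namespace without affecting others. A small plain-C sketch of the same copy-a-read-only-template pattern, using malloc in place of the kernel allocator:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const int template_timeouts[] = { 2, 900, 120, 60 };	/* seconds */

/* Return a writable per-instance copy of a read-only template. */
static int *create_timeout_table(const int *tmpl, size_t len)
{
	int *copy = malloc(len);

	if (copy)
		memcpy(copy, tmpl, len);
	return copy;
}

int main(void)
{
	int *t = create_timeout_table(template_timeouts,
				      sizeof(template_timeouts));

	if (!t)
		return 1;
	t[1] = 600;	/* tune locally; the template stays untouched */
	printf("local %d, template %d\n", t[1], template_timeouts[1]);
	free(t);
	return 0;
}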
+ * --------------------------------------------- + */ +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + pd->tcp_state_table = tcp_states; + return 0; } - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) { + kfree(pd->timeout_table); } @@ -712,14 +692,15 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .csum_check = tcp_csum_check, @@ -728,5 +709,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, }; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index d2930a71084..b62a3c0ff9b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,10 +9,14 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. 
* */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/in.h> #include <linux/ip.h> #include <linux/kernel.h> @@ -24,83 +28,34 @@ #include <net/ip.h> #include <net/ip6_checksum.h> -static struct ip_vs_conn * -udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { + struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; - struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } - - svc = ip_vs_service_get(af, skb->mark, iph.protocol, - &iph.daddr, uh->dest); + net = skb_net(skb); + rcu_read_lock(); + svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, uh->dest); if (svc) { - if (ip_vs_todrop()) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -109,13 +64,18 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
*/ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); + /* NF_ACCEPT */ return 1; } @@ -151,32 +111,31 @@ udp_partial_csum_update(int af, struct udphdr *uhdr, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); + csum_unfold(uhdr->check)))); else #endif uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); + csum_unfold(uhdr->check)))); } static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; - unsigned int udphoff; + unsigned int udphoff = iph->len; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - udphoff = ip_hdrlen(skb); oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ @@ -184,6 +143,8 @@ udp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -191,8 +152,13 @@ udp_snat_handler(struct sk_buff *skb, /* * Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -205,12 +171,13 @@ udp_snat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; @@ -230,6 +197,7 @@ udp_snat_handler(struct sk_buff *skb, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, udph->check, (char*)&(udph->check) - (char*)udph); @@ -239,19 +207,18 @@ udp_snat_handler(struct sk_buff *skb, static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; - unsigned int udphoff; + unsigned int udphoff = iph->len; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - udphoff = ip_hdrlen(skb); oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ @@ -259,6 +226,8 @@ udp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -267,8 +236,13 @@ udp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -278,15 +252,16 @@ udp_dnat_handler(struct sk_buff *skb, * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; @@ -342,7 +317,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -353,7 +328,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -366,19 +341,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 1; } - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. 
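The partial-checksum helpers for TCP and UDP above adjust a stored checksum incrementally when the addresses and length change (the fix moves the complement outside csum_fold and drops it from csum_unfold). The arithmetic behind such incremental adjustments is the ones'-complement update described in RFC 1624; as background, and as a sanity check of that arithmetic only — plain user-space C, not the kernel's csum helpers — an incremental update can be compared against a full recomputation:

#include <stdint.h>
#include <stdio.h>

/* Standard Internet checksum over 16-bit words (even-length buffer). */
static uint16_t csum(const uint16_t *p, int nwords)
{
	uint32_t sum = 0;

	while (nwords--)
		sum += *p++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ~sum & 0xffff;
}

/* RFC 1624: HC' = ~(~HC + ~m + m') for one changed 16-bit field. */
static uint16_t csum_update(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check + (uint16_t)~old_val + new_val;

	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ~sum & 0xffff;
}

int main(void)
{
	uint16_t buf[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
	uint16_t full = csum(buf, 4);
	uint16_t incr = csum_update(full, buf[2], 0x1111);

	buf[2] = 0x1111;	/* mangle one field */
	printf("recomputed %04x, incremental %04x\n", csum(buf, 4), incr);
	return 0;
}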
- */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - static inline __u16 udp_app_hashkey(__be16 port) { return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) @@ -386,44 +348,44 @@ static inline __u16 udp_app_hashkey(__be16 port) } -static int udp_register_app(struct ip_vs_app *inc) +static int udp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); hash = udp_app_hashkey(port); - - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&udp_app_lock); return ret; } static void -udp_unregister_app(struct ip_vs_app *inc) +udp_unregister_app(struct net *net, struct ip_vs_app *inc) { - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); } static int udp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; @@ -435,14 +397,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&udp_app_lock); + rcu_read_unlock(); - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), @@ -457,31 +419,23 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&udp_app_lock); + rcu_read_unlock(); out: return result; } -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = 5*60*HZ, [IP_VS_UDP_S_LAST] = 2*HZ, }; -static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { +static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = "UDP", [IP_VS_UDP_S_LAST] = "BUG!", }; - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - static const char * udp_state_name(int state) { if (state >= IP_VS_UDP_S_LAST) @@ -489,23 +443,34 @@ static const char * udp_state_name(int state) return udp_state_name_table[state] ? 
udp_state_name_table[state] : "?"; } -static int +static void udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; + if (unlikely(!pd)) { + pr_err("UDP no ns data\n"); + return; + } + + cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static void udp_init(struct ip_vs_protocol *pp) +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, + sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } -static void udp_exit(struct ip_vs_protocol *pp) +static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) { + kfree(pd->timeout_table); } @@ -514,11 +479,13 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .protocol = IPPROTO_UDP, .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, + .init = NULL, + .exit = NULL, + .init_netns = __udp_init, + .exit_netns = __udp_exit, .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = udp_snat_handler, .dnat_handler = udp_dnat_handler, .csum_check = udp_csum_check, @@ -529,5 +496,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .app_conn_bind = udp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, }; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 2d16ab7f8c1..176b87c35e3 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -19,6 +19,9 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> @@ -32,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) { - svc->sched_data = &svc->destinations; + struct list_head *p; + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + /* dest is already unlinked, so p->prev is not valid but + * p->next is valid, use it to reach previous entry. 
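The rewritten ip_vs_rr_schedule() below keeps a cursor (svc->sched_data) into the destination list, resumes scanning just after the previous pick, skips destinations that are overloaded or have zero weight, and gives up after at most two passes so a concurrently unlinked entry cannot make it loop forever. A reduced user-space sketch of the same resume-and-skip idea over a plain array (no RCU, no locking, made-up data):

#include <stdio.h>

struct dest {
	int weight;
	int overloaded;
};

/* Pick the next usable destination, resuming after the last choice;
 * returns -1 when nothing is usable.
 */
static int rr_pick(const struct dest *tab, int n, int *last)
{
	int i, idx;

	for (i = 1; i <= n; i++) {
		idx = (*last + i) % n;
		if (tab[idx].weight > 0 && !tab[idx].overloaded) {
			*last = idx;
			return idx;
		}
	}
	return -1;
}

int main(void)
{
	struct dest tab[3] = { { 1, 0 }, { 0, 0 }, { 2, 0 } };
	int last = 0, i;

	for (i = 0; i < 6; i++)
		printf("pick %d\n", rr_pick(tab, 3, &last));
	return 0;
}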
+ */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -43,38 +55,44 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc) * Round-Robin Scheduling */ static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { - struct list_head *p, *q; - struct ip_vs_dest *dest; + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; - IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); - IP_VS_ERR_RL("RR: no destination available\n"); + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. + */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), @@ -91,7 +109,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, .schedule = ip_vs_rr_schedule, }; @@ -103,6 +122,7 @@ static int __init ip_vs_rr_init(void) static void __exit ip_vs_rr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); } module_init(ip_vs_rr_init); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index a46ad9e3501..4dbcda6258b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -17,6 +17,9 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -26,13 +29,14 @@ #include <net/ip_vs.h> +EXPORT_SYMBOL(ip_vs_scheduler_err); /* * IPVS scheduler list */ static LIST_HEAD(ip_vs_schedulers); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); /* @@ -43,25 +47,14 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, { int ret; - if (svc == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - if (scheduler == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); - return -EINVAL; - } - - svc->scheduler = scheduler; - if 
(scheduler->init_service) { ret = scheduler->init_service(svc); if (ret) { - IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + pr_err("%s(): init error\n", __func__); return ret; } } - + rcu_assign_pointer(svc->scheduler, scheduler); return 0; } @@ -69,30 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, /* * Unbind a service with its scheduler */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) { - struct ip_vs_scheduler *sched; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); - return -EINVAL; - } + struct ip_vs_scheduler *cur_sched; - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); - return -EINVAL; - } - } + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; - svc->scheduler = NULL; - return 0; + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can not be set to NULL */ } @@ -103,10 +85,9 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) { struct ip_vs_scheduler *sched; - IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", - sched_name); + IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - read_lock_bh(&__ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -120,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return sched; } if (sched->module) module_put(sched->module); } - read_unlock_bh(&__ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return NULL; } @@ -157,10 +138,34 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { - if (scheduler->module) + if (scheduler && scheduler->module) module_put(scheduler->module); } +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ + struct ip_vs_scheduler *sched; + + sched = rcu_dereference(svc->scheduler); + if (svc->fwmark) { + IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", + sched->name, svc->fwmark, svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 + } else if (svc->af == AF_INET6) { + IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", + sched->name, ip_vs_proto_name(svc->protocol), + &svc->addr.in6, ntohs(svc->port), msg); +#endif + } else { + IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", + sched->name, ip_vs_proto_name(svc->protocol), + &svc->addr.ip, ntohs(svc->port), msg); + } +} /* * Register a scheduler in the scheduler list @@ -170,25 +175,25 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) struct ip_vs_scheduler *sched; if (!scheduler) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + pr_err("%s(): NULL arg\n", __func__); return -EINVAL; } if (!scheduler->name) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + pr_err("%s(): NULL scheduler_name\n", __func__); return -EINVAL; } /* increase the module use count */ ip_vs_use_count_inc(); - write_lock_bh(&__ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if 
(!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already linked\n", scheduler->name); + pr_err("%s(): [%s] scheduler already linked\n", + __func__, scheduler->name); return -EINVAL; } @@ -198,11 +203,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", - scheduler->name); + pr_err("%s(): [%s] scheduler already existed " + "in the system\n", __func__, scheduler->name); return -EINVAL; } } @@ -210,9 +214,9 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); - IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + pr_info("[%s] scheduler registered.\n", scheduler->name); return 0; } @@ -224,15 +228,15 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) { if (!scheduler) { - IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + pr_err("%s(): NULL arg\n", __func__); return -EINVAL; } - write_lock_bh(&__ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " - "is not in the list. failed\n", scheduler->name); + mutex_unlock(&ip_vs_sched_mutex); + pr_err("%s(): [%s] scheduler is not in the list. failed\n", + __func__, scheduler->name); return -EINVAL; } @@ -240,12 +244,12 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); - IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + pr_info("[%s] scheduler unregistered.\n", scheduler->name); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 20e4657d2f3..e446b9fa742 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -35,13 +35,16 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> -static inline unsigned int +static inline int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { /* @@ -56,12 +59,13 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; - IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* * We calculate the load of each dest server as follows: @@ -76,7 +80,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. 
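In the least-load comparison that follows, both products of overhead and weight are widened with (__s64) casts before being compared; with 32-bit arithmetic the products can wrap and silently invert the ordering. A small stand-alone demonstration with synthetic numbers (only the overflow effect matters, the values mean nothing):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int loh = 100000, wd = 50000;	/* overhead of "least", weight of dest  */
	int doh = 60000,  wl = 40000;	/* overhead of dest, weight of "least"  */

	/* Truncated to 32 bits, the left-hand product wraps and the test flips. */
	uint32_t a32 = (uint32_t)((uint64_t)loh * wd);
	uint32_t b32 = (uint32_t)((uint64_t)doh * wl);

	printf("32-bit: %lu > %lu -> %d\n",
	       (unsigned long)a32, (unsigned long)b32, a32 > b32);
	printf("64-bit: %d\n", (int64_t)loh * wd > (int64_t)doh * wl);
	return 0;
}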
*/ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -84,19 +88,19 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) goto nextstage; } } - IP_VS_ERR_RL("SED: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; /* * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -131,6 +135,7 @@ static int __init ip_vs_sed_init(void) static void __exit ip_vs_sed_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); + synchronize_rcu(); } module_init(ip_vs_sed_init); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 75709ebeb63..cc65b2f42cd 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -30,21 +30,34 @@ * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * + * The weight destination attribute can be used to control the + * distribution of connections to the destinations in servernode. The + * greater the weight, the more connections the destination + * will receive. + * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/ip.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/ip_vs.h> +#include <net/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> + /* * IPVS SH bucket */ struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -57,11 +70,24 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) +struct ip_vs_sh_state { + struct rcu_head rcu_head; + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; +}; + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} /* * Returns hash value for IPVS SH entry */ -static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, unsigned int offset) { __be32 addr_fold = addr->ip; @@ -70,7 +96,8 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; + return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + IP_VS_SH_TAB_MASK; } @@ -78,38 +105,102 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) * Get ip_vs_dest associated with supplied parameters. 
*/ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) { - return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; + unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; } +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* first try the dest it's supposed to go to */ + ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + dest = rcu_dereference(s->buckets[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + /* if the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; + hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); + dest = rcu_dereference(s->buckets[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, "SH: selected unavailable " + "server %s:%d (offset %d), reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + /* * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + int d_count; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); + d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); + + IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", + i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + atomic_read(&dest->weight)); + + /* Don't move to next dest until filling weight */ + if (++d_count >= atomic_read(&dest->weight)) { + p = p->next; + d_count = 0; + } - p = p->next; } b++; } @@ -120,16 +211,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
*/ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -138,64 +231,84 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - } - svc->sched_data = tbl; + + svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); return 0; } -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; + struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + ip_vs_sh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; } -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + ip_vs_sh_reassign(s, svc); return 0; } -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. 
- */ -static inline int is_overloaded(struct ip_vs_dest *dest) +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - return dest->flags & IP_VS_DEST_F_OVERLOAD; + __be16 port; + struct tcphdr _tcph, *th; + struct udphdr _udph, *uh; + sctp_sctphdr_t _sctph, *sh; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (unlikely(th == NULL)) + return 0; + port = th->source; + break; + case IPPROTO_UDP: + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (unlikely(uh == NULL)) + return 0; + port = uh->source; + break; + case IPPROTO_SCTP: + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(sh == NULL)) + return 0; + port = sh->source; + break; + default: + port = 0; + } + + return port; } @@ -203,28 +316,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest) * Source Hashing scheduling */ static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + struct ip_vs_sh_state *s; + __be16 port = 0; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - IP_VS_ERR_RL("SH: no destination available\n"); + if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) + port = ip_vs_sh_get_port(skb, iph); + + s = (struct ip_vs_sh_state *) svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) + dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + else + dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), + IP_VS_DBG_ADDR(svc->af, &iph->saddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); @@ -243,7 +360,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; @@ -257,6 +376,7 @@ static int __init ip_vs_sh_init(void) static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 6be5d4efa51..db801263ee9 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -5,6 +5,18 @@ * high-performance and highly available server based on a * cluster of servers. * + * Version 1, is capable of handling both version 0 and 1 messages. + * Version 0 is the plain old format. + * Note Version 0 receivers will just drop Ver 1 messages. + * Version 1 is capable of handle IPv6, Persistence data, + * time-outs, and firewall marks. + * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + * Ver. 
0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions Message: is a complete datagram + * Sync_conn: is a part of a Message + * Param Data is an option to a Sync_conn. + * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * ip_vs_sync: sync connection info from master load balancer to backups @@ -15,8 +27,13 @@ * Alexandre Cassen : Added SyncID support for incoming sync * messages filtering. * Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added Version 1: i.e. IPv6, + * Persistence support, fwmark and time-out. */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/slab.h> #include <linux/inetdevice.h> @@ -32,6 +49,8 @@ #include <linux/wait.h> #include <linux/kernel.h> +#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ + #include <net/ip.h> #include <net/sock.h> @@ -40,11 +59,14 @@ #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ #define IP_VS_SYNC_PORT 8848 /* multicast port */ +#define SYNC_PROTO_VER 1 /* Protocol version in header */ +static struct lock_class_key __ipvs_sync_key; /* * IPVS sync connection entry + * Version 0, i.e. original version. */ -struct ip_vs_sync_conn { +struct ip_vs_sync_conn_v0 { __u8 reserved; /* Protocol, addresses and port numbers */ @@ -68,51 +90,177 @@ struct ip_vs_sync_conn_options { struct ip_vs_seq out_seq; /* outgoing seq. struct */ }; +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. 
options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { + struct ip_vs_sync_v4 v4; + struct ip_vs_sync_v6 v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6 0 +#define STYPE_F_INET6 (1 << STYPE_INET6) + +#define SVER_SHIFT 12 /* Shift to get version */ +#define SVER_MASK 0x0fff /* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA 1 +#define IPVS_OPT_PE_DATA 2 +#define IPVS_OPT_PE_NAME 3 +#define IPVS_OPT_PARAM 7 + +#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) + struct ip_vs_sync_thread_data { + struct net *net; struct socket *sock; char *buf; + int id; }; -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) #define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) /* - The master mulitcasts messages to the backup load balancers in the - following format. + The master mulitcasts messages (Datagrams) to the backup load balancers + in the following format. + + Version 1: + Note, first byte should be Zero, so ver 0 receivers will drop the packet. 0 1 2 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | + | 0 | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Version | Reserved, set to Zero | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | IPVS Sync Connection (1) | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | . | - | . | + ~ . ~ | . 
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | IPVS Sync Connection (n) | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | IPVS Sync Connection (1) | */ #define SYNC_MESG_HEADER_LEN 4 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ -struct ip_vs_sync_mesg { +/* Version 0 header */ +struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; - __u16 size; + __be16 size; /* ip_vs_sync_conn entries start here */ }; -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; +/* Version 1 header */ +struct ip_vs_sync_mesg { + __u8 reserved; /* must be zero */ + __u8 syncid; + __be16 size; + __u8 nr_conns; + __s8 version; /* SYNC_PROTO_VER */ + __u16 spare; + /* ip_vs_sync_conn entries start here */ +}; struct ip_vs_sync_buff { struct list_head list; @@ -124,70 +272,75 @@ struct ip_vs_sync_buff { unsigned char *end; }; +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ + ho->init_seq = get_unaligned_be32(&no->init_seq); + ho->delta = get_unaligned_be32(&no->delta); + ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = __constant_htons(IP_VS_SYNC_PORT), - .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), -}; - +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ + put_unaligned_be32(ho->init_seq, &no->init_seq); + put_unaligned_be32(ho->delta, &no->delta); + put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} -static inline struct ip_vs_sync_buff *sb_dequeue(void) +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) { struct ip_vs_sync_buff *sb; - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ms->sync_queue)) { sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); } else { - sb = list_entry(ip_vs_sync_queue.next, - struct ip_vs_sync_buff, + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; } - spin_unlock_bh(&ip_vs_sync_lock); + spin_unlock_bh(&ipvs->sync_lock); return sb; } -static inline struct ip_vs_sync_buff * 
ip_vs_sync_buff_create(void) +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { kfree(sb); return NULL; } + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ + sb->mesg->version = SYNC_PROTO_VER; + sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->mesg->spare = 0; + sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); + sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->firstuse = jiffies; return sb; } @@ -198,14 +351,24 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) { - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); - else + struct ip_vs_sync_buff *sb = ms->sync_buff; + + spin_lock(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) + wake_up_process(ms->master_thread); + } else ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); + spin_unlock(&ipvs->sync_lock); } /* @@ -213,47 +376,209 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) * than the specified time or the specified time is zero. */ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) { struct ip_vs_sync_buff *sb; - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; + spin_lock_bh(&ipvs->sync_buff_lock); + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); } else sb = NULL; - spin_unlock_bh(&curr_sb_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + return sb; +} + +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) +{ + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; +} + +/* + * Create a new sync buffer for Version 0 proto. 
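ip_vs_sync_buff_create() above primes every outgoing buffer with the Version 1 header: the first byte is kept zero so a Version 0 backup reads nr_conns == 0 and effectively drops the datagram, and size stays in network byte order while entries are appended. A stand-alone mock-up of that header bookkeeping, with the field layout copied from struct ip_vs_sync_mesg above and 36 bytes taken as the size of an IPv4 entry without options:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct sync_mesg_v1 {           /* mirrors struct ip_vs_sync_mesg */
        uint8_t  reserved;      /* must be zero, see the note above */
        uint8_t  syncid;
        uint16_t size;          /* network byte order */
        uint8_t  nr_conns;
        int8_t   version;       /* SYNC_PROTO_VER == 1 */
        uint16_t spare;
};

int main(void)
{
        struct sync_mesg_v1 m;

        memset(&m, 0, sizeof(m));
        m.syncid = 7;
        m.version = 1;
        m.size = htons((uint16_t)sizeof(m));

        /* append one IPv4 entry, sizeof(struct ip_vs_sync_v4) == 36 */
        m.nr_conns++;
        m.size = htons(ntohs(m.size) + 36);

        printf("header %zu bytes, message %d bytes, %d conns\n",
               sizeof(m), ntohs(m.size), m.nr_conns);
        return 0;
}
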
+ */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg_v0 *mesg; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; + mesg->nr_conns = 0; + mesg->syncid = ipvs->master_syncid; + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); + sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); + sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->firstuse = jiffies; return sb; } +/* Check if connection is controlled by persistence */ +static inline bool in_persistence(struct ip_vs_conn *cp) +{ + for (cp = cp->control; cp; cp = cp->control) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + return true; + } + return false; +} + +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) + return 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | + (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | + (1 << IP_VS_SCTP_S_CLOSED)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. 
+ */ + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { + int retries = orig & 3; + + if (retries >= sysctl_sync_retries(ipvs)) + return 0; + if (time_before(now, orig - cp->timeout + + (sync_refresh_period >> 3))) + return 0; + n |= retries + 1; + } + } + sync_period = sysctl_sync_period(ipvs); + if (sync_period > 0) { + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && + pkts % sync_period != sysctl_sync_threshold(ipvs)) + return 0; + } else if (sync_refresh_period <= 0 && + pkts != sysctl_sync_threshold(ipvs)) + return 0; + +set: + cp->old_state = cp->state; + n = cmpxchg(&cp->sync_endtime, orig, n); + return n == orig || force; +} /* + * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) +static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, + int pkts) { - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg_v0 *m; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; int len; - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); - IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + if (unlikely(cp->af != AF_INET)) + return; + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + return; + + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + buff = ms->sync_buff; + if (buff) { + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + /* Send buffer if it is for v1 */ + if (!m->nr_conns) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + } + } + if (!buff) { + buff = ip_vs_sync_buff_create_v0(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; } len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + s = (struct ip_vs_sync_conn_v0 *) buff->head; /* copy members */ + s->reserved = 0; s->protocol = cp->protocol; s->cport = cp->cport; s->vport = cp->vport; @@ -270,73 +595,364 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) } m->nr_conns++; - m->size += len; - curr_sb->head += len; + m->size = htons(ntohs(m->size) + len); + buff->head += len; /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; + if (buff->head + FULL_CONN_SIZE > buff->end) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; } - spin_unlock(&curr_sb_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); + cp = cp->control; + if (cp) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + ip_vs_sync_conn(net, cp->control, pkts); + } } - /* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. 
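ip_vs_sync_conn_needed() above rate-limits how often an established connection is announced: with a non-zero sync_period a sync is sent once per sync_period packets (when the packet count modulo sync_period equals sync_threshold), otherwise only when the count first reaches sync_threshold. A condensed user-space restatement of that packet-count rule, ignoring templates and the refresh/retry handling:

#include <stdio.h>
#include <stdbool.h>

/* simplified: should this packet trigger a sync message? */
static bool sync_wanted(int pkts, int sync_period, int sync_threshold)
{
        if (sync_period > 0)
                return pkts % sync_period == sync_threshold;
        return pkts == sync_threshold;
}

int main(void)
{
        int pkts;

        /* threshold 3, period 50: packets 3, 53, 103, ... get synced */
        for (pkts = 1; pkts <= 120; pkts++)
                if (sync_wanted(pkts, 50, 3))
                        printf("sync at packet %d\n", pkts);
        return 0;
}
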
+ * Sending Version 1 messages */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) { - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; - char *p; - int i; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m; + union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Handle old version of the protocol */ + if (sysctl_sync_ver(ipvs) == 0) { + ip_vs_sync_conn_v0(net, cp, pkts); + return; + } + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); return; } - /* Convert size back to host byte order */ - m->size = ntohs(m->size); + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + pad = 0; + } } - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + ms->sync_buff = buff; + m = buff->mesg; + } + + p = buff->head; + buff->head += pad + len; + m->size = htons(ntohs(m->size) + pad + len); + /* Add ev. padding from prev. sync_conn */ + while (pad--) + *(p++) = 0; + + s = (union ip_vs_sync_conn *)p; + + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? 
STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); + m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + s->v6.caddr = cp->caddr.in6; + s->v6.vaddr = cp->vaddr.in6; + s->v6.daddr = cp->daddr.in6; + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; + } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + + spin_unlock_bh(&ipvs->sync_buff_lock); + +control: + /* synchronize its controller if it has */ + cp = cp->control; + if (!cp) return; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + goto sloop; +} + +/* + * fill_param used by version 1 + */ +static inline int +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(net, af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(net, af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + if (p->pe->module) + module_put(p->pe->module); + return -ENOMEM; + } + p->pe_data_len = pe_data_len; + } + return 0; +} + +/* + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. 
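The sender above packs each connection as a Version 1 entry: the fixed v4/v6 part, optional TLVs for sequence data, PE data and PE name, the total entry length in the low 12 bits of ver_size, and leading padding so every entry starts on a 32-bit boundary. A small user-space sketch of that length and padding arithmetic; 36 bytes is sizeof(struct ip_vs_sync_v4) from above, and the sequence-options block is assumed to be its usual 24 bytes:

#include <stdio.h>
#include <stdint.h>

#define SYNC_V4_LEN     36      /* sizeof(struct ip_vs_sync_v4) */
#define SEQ_OPT_LEN     24      /* assumed sizeof(struct ip_vs_sync_conn_options) */

int main(void)
{
        unsigned int len = SYNC_V4_LEN;
        uintptr_t head = 0x1002;        /* pretend buffer write position */
        unsigned int pad;

        /* a TLV is 1 type byte + 1 length byte + payload */
        len += SEQ_OPT_LEN + 2;

        /* pad so this entry starts 32-bit aligned, as the sender does */
        pad = (4 - head) & 3;

        printf("entry %u bytes, leading pad %u, ver_size low bits 0x%03x\n",
               len, pad, len & 0x0fff);
        return 0;
}
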
+ */ +static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, + unsigned int flags, unsigned int state, + unsigned int protocol, unsigned int type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(param); + else + cp = ip_vs_ct_in_get(param); + + if (cp) { + /* Free pe_data */ + kfree(param->pe_data); + + dest = cp->dest; + spin_lock_bh(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); + } else { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + rcu_read_lock(); + dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, + param->vport, protocol, fwmark, flags); + + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + rcu_read_unlock(); + if (!cp) { + if (param->pe_data) + kfree(param->pe_data); + IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); + return; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); + cp->state = state; + cp->old_state = cp->state; + /* + * For Ver 0 messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * Ver 1 messages style. + * - No problem. 
+ */ + if (timeout) { + if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) + timeout = MAX_SCHEDULE_TIMEOUT / HZ; + cp->timeout = timeout*HZ; + } else { + struct ip_vs_proto_data *pd; + + pd = ip_vs_proto_data_get(net, protocol); + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) + cp->timeout = pd->timeout_table[state]; + else + cp->timeout = (3*60*HZ); } + ip_vs_conn_put(cp); +} - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(struct net *net, const char *buffer, + const size_t buflen) +{ + struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + char *p; + int i; + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); for (i=0; i<m->nr_conns; i++) { - unsigned flags, state; + unsigned int flags, state; if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); + IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); return; } - s = (struct ip_vs_sync_conn *) p; + s = (struct ip_vs_sync_conn_v0 *) p; flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; flags &= ~IP_VS_CONN_F_HASHED; if (flags & IP_VS_CONN_F_SEQ_MASK) { opt = (struct ip_vs_sync_conn_options *)&s[1]; p += FULL_CONN_SIZE; if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); + IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); return; } } else { @@ -348,112 +964,311 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) if (!(flags & IP_VS_CONN_F_TEMPLATE)) { pp = ip_vs_proto_get(s->protocol); if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", s->protocol); continue; } if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", pp->name, state); continue; } } else { /* protocol in templates is not used for state/timeout */ - pp = NULL; if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", state); state = 0; } } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - else - cp = ip_vs_ct_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. 
- */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; + ip_vs_conn_fill_param(net, AF_INET, s->protocol, + (const union nf_inet_addr *)&s->caddr, + s->cport, + (const union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + + /* Send timeout as Zero */ + ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt); + } +} + +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, + __u32 *opt_flags, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_sync_conn_options *topt; + + topt = (struct ip_vs_sync_conn_options *)p; + + if (plen != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); + return -EINVAL; + } + if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { + IP_VS_DBG(2, "BACKUP, conn options found twice\n"); + return -EINVAL; + } + ntoh_seq(&topt->in_seq, &opt->in_seq); + ntoh_seq(&topt->out_seq, &opt->out_seq); + *opt_flags |= IPVS_OPT_F_SEQ_DATA; + return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, + __u8 **data, unsigned int maxlen, + __u32 *opt_flags, __u32 flag) +{ + if (plen > maxlen) { + IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); + return -EINVAL; + } + if (*opt_flags & flag) { + IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); + return -EINVAL; + } + *data_len = plen; + *data = p; + *opt_flags |= flag; + return 0; +} +/* + * Process a Version 1 sync. connection + */ +static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +{ + struct ip_vs_sync_conn_options opt; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + __u32 flags; + unsigned int af, state, pe_data_len=0, pe_name_len=0; + __u8 *pe_data=NULL, *pe_name=NULL; + __u32 opt_flags=0; + int retc=0; + + s = (union ip_vs_sync_conn *) p; + + if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); + retc = 10; + goto out; +#endif + } else if (!s->v4.type) { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } else { + return -10; + } + if (p > msg_end) + return -20; + + /* Process optional params check Type & Len. */ + while (p < msg_end) { + int ptype; + int plen; + + if (p+2 > msg_end) + return -30; + ptype = *(p++); + plen = *(p++); + + if (!plen || ((p + plen) > msg_end)) + return -40; + /* Handle seq option p = param data */ + switch (ptype & ~IPVS_OPT_F_PARAM) { + case IPVS_OPT_SEQ_DATA: + if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) + return -50; + break; + + case IPVS_OPT_PE_DATA: + if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, + IP_VS_PEDATA_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_DATA)) + return -60; + break; + + case IPVS_OPT_PE_NAME: + if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, + IP_VS_PENAME_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_NAME)) + return -70; + break; + + default: + /* Param data mandatory ? 
*/ + if (!(ptype & IPVS_OPT_F_PARAM)) { + IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", + ptype & ~IPVS_OPT_F_PARAM); + retc = 20; + goto out; + } + } + p += plen; /* Next option */ + } + + /* Get flags and Mask off unsupported */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", + s->v4.protocol); + retc = 30; + goto out; + } + if (state >= pp->num_states) { + IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", + pp->name, state); + retc = 40; + goto out; + } + } else { + /* protocol in templates is not used for state/timeout */ + if (state > 0) { + IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", + state); + state = 0; + } + } + if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + pe_data_len, pe_name, pe_name_len)) { + retc = 50; + goto out; + } + /* If only IPv4, just silent skip IPv6 */ + if (af == AF_INET) + ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, + ntohl(s->v4.timeout), ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#ifdef CONFIG_IP_VS_IPV6 + else + ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, + ntohl(s->v6.timeout), ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#endif + return 0; + /* Error exit */ +out: + IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); + return retc; + +} +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + * Handles Version 0 & 1 + */ +static void ip_vs_process_message(struct net *net, __u8 *buffer, + const size_t buflen) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; + __u8 *p, *msg_end; + int i, nr_conns; + + if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { + IP_VS_DBG(2, "BACKUP, message header too short\n"); + return; + } + + if (buflen != ntohs(m2->size)) { + IP_VS_DBG(2, "BACKUP, bogus message size\n"); + return; + } + /* SyncID sanity check */ + if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); + return; + } + /* Handle version 1 message */ + if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) + && (m2->spare == 0)) { + + msg_end = buffer + sizeof(struct ip_vs_sync_mesg); + nr_conns = m2->nr_conns; + + for (i=0; i<nr_conns; i++) { + union ip_vs_sync_conn *s; + unsigned int size; + int retc; + + p = msg_end; + if (p + sizeof(s->v4) > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + return; } - cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - (union nf_inet_addr *)&s->daddr, - s->dport, - flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - IP_VS_ERR("ip_vs_conn_new failed\n"); + s = (union ip_vs_sync_conn *)p; + size = ntohs(s->v4.ver_size) & SVER_MASK; + msg_end = p + size; + /* Basic sanity checks */ + if (msg_end > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update 
active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", + ntohs(s->v4.ver_size) >> SVER_SHIFT); + return; + } + /* Process a single sync_conn */ + retc = ip_vs_proc_sync_conn(net, p, msg_end); + if (retc < 0) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", + retc); + return; } + /* Make sure we have 32 bit alignment */ + msg_end = p + ((size + 3) & ~3); } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); + } else { + /* Old type of message */ + ip_vs_process_message_v0(net, buffer, buflen); + return; } } /* + * Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ + lock_sock(sk); + if (mode) { + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, + sysctl_wmem_max); + sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } else { + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, + sysctl_rmem_max); + sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +/* * Setup loopback of outgoing multicasts on a sending socket */ static void set_mcast_loop(struct sock *sk, u_char loop) @@ -486,8 +1301,10 @@ static int set_mcast_if(struct sock *sk, char *ifname) { struct net_device *dev; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -506,30 +1323,33 @@ static int set_mcast_if(struct sock *sk, char *ifname) * Set the maximum length of sync message according to the * specified interface's MTU. 
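set_sync_mesg_maxlen() below derives the sync datagram budget from the interface MTU: for the master it subtracts the IP and UDP headers, the 4-byte sync header and a further 20-byte reserve, then rounds down to a whole number of Version 0 entries capped at 255. The same arithmetic as a quick user-space check, assuming a 1500-byte MTU and the usual 24-byte Version 0 entry:

#include <stdio.h>

#define SYNC_MESG_HEADER_LEN    4
#define SIMPLE_CONN_SIZE        24      /* assumed sizeof(struct ip_vs_sync_conn_v0) */
#define MAX_CONNS_PER_SYNCBUFF  255

int main(void)
{
        int mtu = 1500, iphdr = 20, udphdr = 8;
        int num, maxlen;

        num = (mtu - iphdr - udphdr - SYNC_MESG_HEADER_LEN - 20) /
              SIMPLE_CONN_SIZE;
        if (num > MAX_CONNS_PER_SYNCBUFF)
                num = MAX_CONNS_PER_SYNCBUFF;
        maxlen = SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;

        printf("%d Version 0 entries per datagram, send_mesg_maxlen = %d\n",
               num, maxlen);
        return 0;
}
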
*/ -static int set_sync_mesg_maxlen(int sync_state) +static int set_sync_mesg_maxlen(struct net *net, int sync_state) { + struct netns_ipvs *ipvs = net_ipvs(net); struct net_device *dev; int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); + if (!dev) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr) - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); + "message %d.\n", ipvs->send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); + if (!dev) return -ENODEV; - sync_recv_mesg_maxlen = dev->mtu - + ipvs->recv_mesg_maxlen = dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr); IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); + "message %d.\n", ipvs->recv_mesg_maxlen); } return 0; @@ -544,6 +1364,7 @@ static int set_sync_mesg_maxlen(int sync_state) static int join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) { + struct net *net = sock_net(sk); struct ip_mreqn mreq; struct net_device *dev; int ret; @@ -551,7 +1372,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -568,17 +1390,19 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) static int bind_mcastif_addr(struct socket *sock, char *ifname) { + struct net *net = sock_net(sock->sk); struct net_device *dev; __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); if (!addr) - IP_VS_ERR("You probably need to specify IP address on " - "multicast interface.\n"); + pr_err("You probably need to specify IP address on " + "multicast interface.\n"); IP_VS_DBG(7, "binding socket with (%s) %pI4\n", ifname, &addr); @@ -594,44 +1418,59 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket * make_send_sock(void) +static struct socket *make_send_sock(struct net *net, int id) { + struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; - /* First create a socket */ + /* First create a socket move it to right name space later */ result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); + pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + /* + * Kernel sockets that are a part 
of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); + result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { - IP_VS_ERR("Error setting outbound mcast interface\n"); + pr_err("Error setting outbound mcast interface\n"); goto error; } set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { - IP_VS_ERR("Error binding address of the mcast interface\n"); + pr_err("Error binding address of the mcast interface\n"); goto error; } result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, sizeof(struct sockaddr), 0); if (result < 0) { - IP_VS_ERR("Error connecting to the multicast addr\n"); + pr_err("Error connecting to the multicast addr\n"); goto error; } return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -639,41 +1478,56 @@ static struct socket * make_send_sock(void) /* * Set up receiving multicast socket over UDP */ -static struct socket * make_receive_sock(void) +static struct socket *make_receive_sock(struct net *net, int id) { + struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; /* First create a socket */ result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); + pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. 
+ */ + sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; + sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, sizeof(struct sockaddr)); if (result < 0) { - IP_VS_ERR("Error binding to the multicast addr\n"); + pr_err("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ result = join_mcast_group(sock->sk, (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); + ipvs->backup_mcast_ifn); if (result < 0) { - IP_VS_ERR("Error joining to the multicast group\n"); + pr_err("Error joining to the multicast group\n"); goto error; } return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -695,18 +1549,19 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) return len; } -static void +static int ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) { int msize; + int ret; - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); + msize = ntohs(msg->size); - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - IP_VS_ERR("ip_vs_send_async error\n"); + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; } static int @@ -722,53 +1577,95 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) iov.iov_base = buffer; iov.iov_len = (size_t)buflen; - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); if (len < 0) - return -1; + return len; LeaveFunction(7); return len; } +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ms->master_thread); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); - - while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d, id = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + 
schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; } - - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + /* (Ab)use interruptible sleep to avoid increasing + * the load avg. + */ + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop()); + if (unlikely(kthread_should_stop())) + goto done; } - - schedule_timeout_interruptible(HZ); + ip_vs_sync_buff_release(sb); } +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { + while ((sb = sb_dequeue(ipvs, ms))) ip_vs_sync_buff_release(sb); - } + __set_current_state(TASK_RUNNING); /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { + sb = get_curr_sync_buff(ipvs, ms, 0); + if (sb) ip_vs_sync_buff_release(sb); - } /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo); return 0; @@ -778,36 +1675,34 @@ static int sync_thread_master(void *data) static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); int len; - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d, id = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); while (!kthread_should_stop()) { - wait_event_interruptible(*tinfo->sock->sk->sk_sleep, + wait_event_interruptible(*sk_sleep(tinfo->sock->sk), !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) || kthread_should_stop()); /* do we have data now? 
*/ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); + ipvs->recv_mesg_maxlen); if (len <= 0) { - IP_VS_ERR("receiving message error\n"); + if (len != -EAGAIN) + pr_err("receiving message error\n"); break; } - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); - local_bh_enable(); + ip_vs_process_message(tinfo->net, tinfo->buf, len); } } /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo->buf); kfree(tinfo); @@ -815,128 +1710,239 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; + struct task_struct **array = NULL, *task; struct socket *sock; - char *name, *buf = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); + char *name; int (*threadfn)(void *data); + int id, count; int result = -ENOMEM; - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); + sizeof(struct ip_vs_sync_conn_v0)); + + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) + if (ipvs->ms) return -EEXIST; - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; + strlcpy(ipvs->master_mcast_ifn, mcast_ifn, + sizeof(ipvs->master_mcast_ifn)); + ipvs->master_syncid = syncid; + name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; - sock = make_send_sock(); } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) + if (ipvs->backup_threads) return -EEXIST; - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; + strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, + sizeof(ipvs->backup_mcast_ifn)); + ipvs->backup_syncid = syncid; + name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(); } else { return -EINVAL; } - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; + + ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } else { + array = kzalloc(count * sizeof(struct task_struct *), + GFP_KERNEL); + if (!array) + goto out; } + set_sync_mesg_maxlen(net, state); - set_sync_mesg_maxlen(state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); - if (!buf) + tinfo = NULL; + for (id = 0; id < count; id++) { + if (state == IP_VS_STATE_MASTER) + sock = make_send_sock(net, id); + else + sock = make_receive_sock(net, id); + if (IS_ERR(sock)) { + result 
= PTR_ERR(sock); + goto outtinfo; + } + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) goto outsocket; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->sock = sock; - tinfo->buf = buf; + tinfo->net = net; + tinfo->sock = sock; + if (state == IP_VS_STATE_BACKUP) { + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto outtinfo; + } else { + tinfo->buf = NULL; + } + tinfo->id = id; - task = kthread_run(threadfn, tinfo, name); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + tinfo = NULL; + if (state == IP_VS_STATE_MASTER) + ipvs->ms[id].master_thread = task; + else + array[id] = task; } /* mark as active */ - *realtask = task; - ip_vs_sync_state |= state; + + if (state == IP_VS_STATE_BACKUP) + ipvs->backup_threads = array; + spin_lock_bh(&ipvs->sync_buff_lock); + ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); /* increase the module use count */ ip_vs_use_count_inc(); return 0; -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); outsocket: - sock_release(sock); + sk_release_kernel(sock->sk); + +outtinfo: + if (tinfo) { + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + } + count = id; + while (count-- > 0) { + if (state == IP_VS_STATE_MASTER) + kthread_stop(ipvs->ms[count].master_thread); + else + kthread_stop(array[count]); + } + kfree(array); + out: + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } return result; } -int stop_sync_thread(int state) +int stop_sync_thread(struct net *net, int state) { - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + struct netns_ipvs *ipvs = net_ipvs(net); + struct task_struct **array; + int id; + int retc = -EINVAL; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) + if (!ipvs->ms) return -ESRCH; - IP_VS_INFO("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); - /* * The lock synchronizes with sb_queue_tail(), so that we don't * add sync buffers to the queue, when we are already in * progress of stopping the master sync daemon. 
*/ - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ms->master_thread)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(ms->master_thread); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) + if (!ipvs->backup_threads) return -ESRCH; - IP_VS_INFO("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); - - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; - } else { - return -EINVAL; + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + array = ipvs->backup_threads; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(array[id])); + ret = kthread_stop(array[id]); + if (retc >= 0) + retc = ret; + } + kfree(array); + ipvs->backup_threads = NULL; } /* decrease the module use count */ ip_vs_use_count_dec(); + return retc; +} + +/* + * Initialize data struct for each netns + */ +int __net_init ip_vs_sync_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); return 0; } + +void ip_vs_sync_net_cleanup(struct net *net) +{ + int retc; + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); + mutex_unlock(&ipvs->sync_mutex); +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 8e942565b47..b5b4650d50a 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -19,35 +19,23 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. 
(This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - /* * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); @@ -64,27 +52,27 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; - loh = ip_vs_wlc_dest_overhead(least); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } - IP_VS_ERR_RL("WLC: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; /* * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = ip_vs_wlc_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -119,6 +107,7 @@ static int __init ip_vs_wlc_init(void) static void __exit ip_vs_wlc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); + synchronize_rcu(); } module_init(ip_vs_wlc_init); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index f7d74ef1ecf..0546cd572d6 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -18,37 +18,59 @@ * */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/module.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/net.h> +#include <linux/gcd.h> #include <net/ip_vs.h> +/* The WRR algorithm depends on some caclulations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced with di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di but we run in parallel with + * weight changes, it is possible some dest weight to be reduced + * below di, bad if it is the only available dest. + * + * So, we modify how mw is calculated, now it is reduced with (di - 1), + * so that last cw is 1 to catch such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... 
+ * last pass: cw = 1 + * + */ + /* * current destination pointer for weighted round-robin scheduling */ struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ + struct ip_vs_dest *cl; /* current dest or head */ int cw; /* current weight */ int mw; /* maximum weight */ int di; /* decreasing interval */ + struct rcu_head rcu_head; }; -/* - * Get the gcd of server weights - */ -static int gcd(int a, int b) -{ - int c; - - while ((c = a % b)) { - a = b; - b = c; - } - return b; -} - static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) { struct ip_vs_dest *dest; @@ -74,11 +96,12 @@ static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) { struct ip_vs_dest *dest; - int weight = 0; + int new_weight, weight = 0; list_for_each_entry(dest, &svc->destinations, n_list) { - if (atomic_read(&dest->weight) > weight) - weight = atomic_read(&dest->weight); + new_weight = atomic_read(&dest->weight); + if (new_weight > weight) + weight = new_weight; } return weight; @@ -92,41 +115,45 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) /* * Allocate the mark variable for WRR scheduling */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); + if (mark == NULL) return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); + + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; svc->sched_data = mark; return 0; } -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) { + struct ip_vs_wrr_mark *mark = svc->sched_data; + /* * Release the mark variable */ - kfree(svc->sched_data); - - return 0; + kfree_rcu(mark, rcu_head); } -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { struct ip_vs_wrr_mark *mark = svc->sched_data; - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -135,80 +162,82 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) * Weighted Round-Robin Scheduling */ static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { - struct ip_vs_dest *dest; + struct ip_vs_dest *dest, *last, *stop = NULL; struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; + bool last_pass = false, restarted = false; - IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. 
- */ - write_lock(&svc->sched_lock); - p = mark->cl; + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - IP_VS_ERR_RL("WRR: no destination available: " - "no destinations present\n"); - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - IP_VS_ERR_RL("WRR: no destination " - "available\n"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - IP_VS_ERR_RL("WRR: no destination available: " - "all destinations are overloaded\n"); - goto out; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. 
+ */ + stop = last; } } +found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + mark->cl = dest; out: - write_unlock(&svc->sched_lock); + spin_unlock_bh(&svc->sched_lock); return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; } @@ -219,7 +248,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, .schedule = ip_vs_wrr_schedule, }; @@ -231,6 +262,7 @@ static int __init ip_vs_wrr_init(void) static void __exit ip_vs_wrr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); } module_init(ip_vs_wrr_init); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 425ab144f15..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -11,9 +11,25 @@ * * Changes: * + * Description of forwarding methods: + * - all transmitters are called from LOCAL_IN (remote clients) and + * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD + * - not all connections have destination server, for example, + * connections in backup server when fwmark is used + * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback + * LOCAL_OUT rules: + * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) + * - skb->pkt_type is not set yet + * - the only place where we can see skb->sk != NULL */ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ @@ -22,180 +38,495 @@ #include <net/route.h> /* for ip_route_output */ #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/addrconf.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> +enum { + IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ + IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ + IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to + * local + */ + IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ + IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ + IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ +}; + +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ + return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ + kfree(dest_dst); +} /* * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, + struct dst_entry *dst, u32 dst_cookie) { - struct dst_entry *old_dst; + struct ip_vs_dest_dst 
*old; - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dst_release(old_dst); + old = rcu_dereference_protected(dest->dest_dst, + lockdep_is_held(&dest->dst_lock)); + + if (dest_dst) { + dest_dst->dst_cache = dst; + dest_dst->dst_cookie = dst_cookie; + } + rcu_assign_pointer(dest->dest_dst, dest_dst); + + if (old) + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest) { - struct dst_entry *dst = dest->dst_cache; + struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); + struct dst_entry *dst; - if (!dst) + if (!dest_dst) return NULL; - if ((dst->obsolete - || (dest->af == AF_INET && rtos != dest->dst_rtos)) && - dst->ops->check(dst, cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); + dst = dest_dst->dst_cache; + if (dst->obsolete && + dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; + return dest_dst; +} + +static inline bool +__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) +{ + if (IP6CB(skb)->frag_max_size) { + /* frag_max_size tell us that, this packet have been + * defragmented by netfilter IPv6 conntrack module. + */ + if (IP6CB(skb)->frag_max_size > mtu) + return true; /* largest fragment violate MTU */ } - dst_hold(dst); - return dst; + else if (skb->len > mtu && !skb_is_gso(skb)) { + return true; /* Packet size violate MTU size */ + } + return false; +} + +/* Get route to daddr, update *saddr, optionally bind route to saddr */ +static struct rtable *do_output_route4(struct net *net, __be32 daddr, + int rt_mode, __be32 *saddr) +{ + struct flowi4 fl4; + struct rtable *rt; + int loop = 0; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; + +retry: + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + /* Invalid saddr ? 
*/ + if (PTR_ERR(rt) == -EINVAL && *saddr && + rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { + *saddr = 0; + flowi4_update_output(&fl4, 0, 0, daddr, 0); + goto retry; + } + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); + return NULL; + } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + ip_rt_put(rt); + *saddr = fl4.saddr; + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + loop++; + goto retry; + } + *saddr = fl4.saddr; + return rt; } -static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +/* Get route to destination or remote server */ +static int +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, int rt_mode, __be32 *ret_saddr) { + struct net *net = dev_net(skb_dst(skb)->dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rtable *ort; /* Original route */ + struct iphdr *iph; + __be16 df; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = dest->addr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &dest->addr.ip); - return NULL; + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", - &dest->addr.ip, - atomic_read(&rt->u.dst.__refcnt), rtos); + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); + if (!rt) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; + } + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); + } + daddr = dest->addr.ip; + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + } else { + __be32 saddr = htonl(INADDR_ANY); + + noref = 0; + + /* For such unconfigured boxes avoid many route lookups + * for performance reasons because we do not remember saddr + */ + rt_mode &= ~IP_VS_RT_MODE_CONNECT; + rt = do_output_route4(net, daddr, rt_mode, &saddr); + if (!rt) + goto err_unreach; + if (ret_saddr) + *ret_saddr = saddr; + } + + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? 
+ "local":"non-local", &daddr); + goto err_put; + } + iph = ip_hdr(skb); + if (likely(!local)) { + if (unlikely(ipv4_is_loopback(iph->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI4 to non-local address, dest: %pI4\n", + &iph->saddr, &daddr); + goto err_put; } - spin_unlock(&dest->dst_lock); } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = cp->daddr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &cp->daddr.ip); - return NULL; + ort = skb_rtable(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !(ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to " + "local requires NAT method, dest: %pI4\n", + &iph->daddr, &daddr); + goto err_put; } + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; } - return rt; + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + df = iph->frag_off & htons(IP_DF); + } else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; + } + ort = skb_rtable(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + /* MTU check allowed? */ + df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; + } + + /* MTU checking */ + if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #ifdef CONFIG_IP_VS_IPV6 -static struct rt6_info * -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) + +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} + +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm) +{ + struct dst_entry *dst; + struct flowi6 fl6 = { + .daddr = *daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6.daddr, 0, &fl6.saddr) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + dst = NULL; + goto out_err; + } + } + *ret_saddr = fl6.saddr; + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + +/* + * Get route to destination or remote server + */ +static int +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { + struct net *net = dev_net(skb_dst(skb)->dev); + struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rt6_info *ort; /* Original route */ + struct dst_entry *dst; + int mtu; + int local, noref = 1; if (dest) { - 
spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); - if (!rt) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = dest->addr.in6, - .saddr = { - .s6_addr32 = - { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, - NULL, &fl); - if (!rt) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", - &dest->addr.in6); - return NULL; + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { + u32 cookie; + + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest_dst->dst_saddr.in6, + do_xfrm); + if (!dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", - &dest->addr.in6, - atomic_read(&rt->u.dst.__refcnt)); + rt = (struct rt6_info *) dst; + cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest_dst->dst_saddr.in6, + atomic_read(&rt->dst.__refcnt)); } - spin_unlock(&dest->dst_lock); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = cp->daddr.in6, - .saddr = { - .s6_addr32 = { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", - &cp->daddr.in6); - return NULL; + noref = 0; + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + if (!dst) + goto err_unreach; + rt = (struct rt6_info *) dst; + } + + local = __ip_vs_is_local_route6(rt); + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", + local ? 
"local":"non-local", daddr); + goto err_put; + } + if (likely(!local)) { + if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI6c to non-local address, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err_put; + } + } else { + ort = (struct rt6_info *) skb_dst(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !__ip_vs_is_local_route6(ort)) { + IP_VS_DBG_RL("Redirect from non-local address %pI6c " + "to local requires NAT method, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->daddr, daddr); + goto err_put; } + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; } - return rt; + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + ort = (struct rt6_info *) skb_dst(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + } + + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #endif -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) { - struct dst_entry *old_dst; + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset(skb); + skb_forward_csum(skb); + } + return ret; +} - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; } -#define IP_VS_XMIT(pf, skb, rt) \ -do { \ - (skb)->ipvs_property = 1; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ -} while (0) +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, 
skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} /* @@ -203,10 +534,10 @@ do { \ */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } @@ -217,64 +548,31 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; EnterFunction(10); - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, dest: %pI4\n", - &iph->daddr); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + rcu_read_lock(); + if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, + NULL) < 0) goto tx_error; - } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(PF_INET, skb, rt); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -282,63 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - struct ipv6hdr *iph = ipv6_hdr(skb); - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - EnterFunction(10); - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, dest: %pI6\n", - &iph->daddr); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); + rcu_read_lock(); + if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
- */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(PF_INET6, skb, rt); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -350,152 +612,181 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); + int local, rc, was_input; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): " + "stopping DNAT to local address"); + goto tx_error; + } + } +#endif - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to loopback address"); goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) goto tx_error; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. 
*/ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(PF_INET, skb, rt); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; - tx_error_icmp: - dst_link_failure(skb); tx_error: - LeaveFunction(10); kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; + int local, rc; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; - p = skb_header_pointer(skb, sizeof(struct ipv6hdr), - sizeof(_pt), &_pt); + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error; + } + } +#endif - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. 
*/ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(PF_INET6, skb, rt); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; -tx_error_icmp: - dst_link_failure(skb); tx_error: LeaveFunction(10); kfree_skb(skb); + rcu_read_unlock(); return NF_STOLEN; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; } #endif @@ -521,72 +812,57 @@ tx_error_put: */ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); struct rtable *rt; /* Route to the other host */ + __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; + __be16 df; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; + int ret, local; EnterFunction(10); - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - htons(ETH_P_IP), skb->protocol); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_CONNECT | + IP_VS_RT_MODE_TUNNEL, &saddr); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - if (mtu < 68) { - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + rt = skb_rtable(skb); + tdev = rt->dst.dev; - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); - goto tx_error; - } + /* Copy DF, reset fragment offset and MF */ + df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); + + if (!new_skb) + goto tx_error; + consume_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); } - skb->transport_header = old_transport_header; + skb->transport_header = skb->network_header; /* fix old IP header checksum */ ip_send_check(old_iph); @@ -595,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - /* * Push down and install the IPIP header. 
*/ @@ -608,24 +880,28 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = tos; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = cp->daddr.ip; + iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - ip_local_out(skb); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -633,78 +909,56 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct ipv6hdr *old_iph = ipv6_hdr(skb); - sk_buff_data_t old_transport_header = skb->transport_header; struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; + int ret, local; EnterFunction(10); - if (skb->protocol != htons(ETH_P_IPV6)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " - "ETH_P_IPV6: %d, skb protocol: %d\n", - htons(ETH_P_IPV6), skb->protocol); - goto tx_error; - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); - /* TODO IPv6: do we need this check in IPv6? */ - if (mtu < 1280) { - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); - goto tx_error; - } + rt = (struct rt6_info *) skb_dst(skb); + tdev = rt->dst.dev; /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->u.dst); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); + + if (!new_skb) + goto tx_error; + consume_skb(skb); skb = new_skb; old_iph = ipv6_hdr(skb); } - skb->transport_header = old_transport_header; + skb->transport_header = skb->network_header; skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - /* * Push down and install the IPIP header. */ @@ -715,23 +969,27 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); iph->priority = old_iph->priority; memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - iph->daddr = rt->rt6i_dst.addr; - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + iph->daddr = cp->daddr.in6; + iph->saddr = saddr; iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - ip6_local_out(skb); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -744,52 +1002,38 @@ tx_error: */ int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } ip_send_check(ip_hdr(skb)); - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(PF_INET, skb, rt); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -797,52 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; + int local; EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(PF_INET6, skb, rt); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -855,11 +1083,13 @@ tx_error: */ int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *iph) { struct rtable *rt; /* Route to the other host */ - int mtu; int rc; + int local; + int rt_mode, was_input; EnterFunction(10); @@ -868,7 +1098,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); + rc = cp->packet_xmit(skb, cp, pp, iph); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -879,61 +1109,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * mangle and send the packet here (only for VS/NAT) */ + was_input = rt_is_input_route(skb_rtable(skb)); + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? 
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) - goto tx_error_icmp; + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error; + } + } +#endif - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; ip_vs_nat_icmp(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); + skb->ignore_df = 1; - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); goto out; - tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; int rc; + int local; + int rt_mode; EnterFunction(10); @@ -942,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); + rc = cp->packet_xmit(skb, cp, pp, ipvsh); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -954,50 +1202,65 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? 
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error; + } + } +#endif - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI6\n", + __func__, &cp->daddr.in6); goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; ip_vs_nat_icmp_v6(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); + skb->ignore_df = 1; - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); goto out; -tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; } #endif |
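
The ip_vs_wlc.c hunk above routes the load estimate through ip_vs_dest_conn_overhead() and casts both sides of the weight comparison to __s64. A minimal stand-alone sketch of that comparison follows; it is not kernel code: conn_overhead() stands in for the kernel helper using the activeconns*256 + inactconns estimate removed from ip_vs_wlc.c, and the destinations and counters are made up.

```c
/*
 * Stand-alone illustration of the widened WLC comparison.  conn_overhead()
 * mirrors the activeconns*256 + inactconns estimate that the hunk above
 * removes from ip_vs_wlc.c in favour of ip_vs_dest_conn_overhead(); the
 * destinations and counters are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct dest {
	int activeconns;
	int inactconns;
	int weight;
};

static int conn_overhead(const struct dest *d)
{
	return (d->activeconns << 8) + d->inactconns;
}

/* Non-zero when 'a' is the less loaded choice relative to its weight. */
static int wlc_prefer(const struct dest *a, const struct dest *b)
{
	/*
	 * These products are what the patch casts to __s64 in the kernel:
	 * with large connection counts and weights a 32-bit multiply could
	 * overflow and invert the comparison.
	 */
	return (int64_t)conn_overhead(a) * b->weight <
	       (int64_t)conn_overhead(b) * a->weight;
}

int main(void)
{
	struct dest d1 = { .activeconns = 1000, .inactconns = 50, .weight = 100 };
	struct dest d2 = { .activeconns = 400,  .inactconns = 10, .weight = 30 };

	printf("prefer d1 over d2: %d\n", wlc_prefer(&d1, &d2));
	return 0;
}
```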

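The comment block added to ip_vs_wrr.c describes scheduling in passes over a current weight cw that starts at mw = max_weight - (di - 1) and steps down by di, the gcd of all weights, so the final pass always runs with cw = 1. The following small simulation of those passes is a sketch only, with hypothetical destination weights; gcd() plays the role of the linux/gcd.h helper the patch switches to.

```c
/*
 * Stand-alone illustration (not kernel code) of the pass scheme the
 * ip_vs_wrr.c comment describes.  Destination weights are hypothetical.
 */
#include <stdio.h>

static int gcd(int a, int b)
{
	while (b) {
		int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	int weights[] = { 4, 2, 2 };		/* assumed dest weights */
	int n = sizeof(weights) / sizeof(weights[0]);
	int di, maxw, mw, cw, i;

	di = maxw = weights[0];
	for (i = 1; i < n; i++) {
		di = gcd(di, weights[i]);
		if (weights[i] > maxw)
			maxw = weights[i];
	}

	/* mw is lowered by (di - 1) so the final pass runs with cw = 1 */
	mw = maxw - (di - 1);

	for (cw = mw; cw > 0; cw -= di) {
		printf("pass cw=%d:", cw);
		for (i = 0; i < n; i++)
			if (weights[i] >= cw)
				printf(" dest%d", i);
		printf("\n");
	}
	return 0;
}
```

With weights 4, 2 and 2, di = 2 and mw = 3: one cycle selects only the first destination on the cw = 3 pass and all three on the cw = 1 pass, i.e. a 2:1:1 ratio matching the configured weights.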