From b98e1747eecc19b872572c5fffedc1868531dac6 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 5 Nov 2007 20:42:16 -0800 Subject: [NETFILTER]: Sort matches/targets in Kbuild file Sort matches and targets in the Kbuild file. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/Kbuild | 18 +++++++++--------- include/linux/netfilter_ipv4/Kbuild | 28 ++++++++++++++-------------- include/linux/netfilter_ipv6/Kbuild | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index f2eaea2234e..b87e83a5e07 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -4,25 +4,28 @@ header-y += nfnetlink_conntrack.h header-y += nfnetlink_log.h header-y += nfnetlink_queue.h header-y += xt_CLASSIFY.h +header-y += xt_CONNMARK.h +header-y += xt_CONNSECMARK.h +header-y += xt_DSCP.h +header-y += xt_MARK.h +header-y += xt_NFLOG.h +header-y += xt_NFQUEUE.h +header-y += xt_SECMARK.h +header-y += xt_TCPMSS.h header-y += xt_comment.h header-y += xt_connbytes.h header-y += xt_connmark.h -header-y += xt_CONNMARK.h header-y += xt_conntrack.h header-y += xt_dccp.h header-y += xt_dscp.h -header-y += xt_DSCP.h header-y += xt_esp.h -header-y += xt_helper.h header-y += xt_hashlimit.h +header-y += xt_helper.h header-y += xt_length.h header-y += xt_limit.h header-y += xt_mac.h header-y += xt_mark.h -header-y += xt_MARK.h header-y += xt_multiport.h -header-y += xt_NFQUEUE.h -header-y += xt_NFLOG.h header-y += xt_pkttype.h header-y += xt_policy.h header-y += xt_realm.h @@ -32,9 +35,6 @@ header-y += xt_statistic.h header-y += xt_string.h header-y += xt_tcpmss.h header-y += xt_tcpudp.h -header-y += xt_SECMARK.h -header-y += xt_CONNSECMARK.h -header-y += xt_TCPMSS.h unifdef-y += nf_conntrack_common.h unifdef-y += nf_conntrack_ftp.h diff --git a/include/linux/netfilter_ipv4/Kbuild b/include/linux/netfilter_ipv4/Kbuild index 7185792b900..3a7105bb8f3 100644 --- a/include/linux/netfilter_ipv4/Kbuild +++ b/include/linux/netfilter_ipv4/Kbuild @@ -1,47 +1,47 @@ -header-y += ipt_addrtype.h -header-y += ipt_ah.h header-y += ipt_CLASSIFY.h header-y += ipt_CLUSTERIP.h +header-y += ipt_CONNMARK.h +header-y += ipt_DSCP.h +header-y += ipt_ECN.h +header-y += ipt_LOG.h +header-y += ipt_MARK.h +header-y += ipt_NFQUEUE.h +header-y += ipt_REJECT.h +header-y += ipt_SAME.h +header-y += ipt_TCPMSS.h +header-y += ipt_TOS.h +header-y += ipt_TTL.h +header-y += ipt_ULOG.h +header-y += ipt_addrtype.h +header-y += ipt_ah.h header-y += ipt_comment.h header-y += ipt_connbytes.h header-y += ipt_connmark.h -header-y += ipt_CONNMARK.h header-y += ipt_conntrack.h header-y += ipt_dccp.h header-y += ipt_dscp.h -header-y += ipt_DSCP.h header-y += ipt_ecn.h -header-y += ipt_ECN.h header-y += ipt_esp.h header-y += ipt_hashlimit.h header-y += ipt_helper.h header-y += ipt_iprange.h header-y += ipt_length.h header-y += ipt_limit.h -header-y += ipt_LOG.h header-y += ipt_mac.h header-y += ipt_mark.h -header-y += ipt_MARK.h header-y += ipt_multiport.h -header-y += ipt_NFQUEUE.h header-y += ipt_owner.h header-y += ipt_physdev.h header-y += ipt_pkttype.h header-y += ipt_policy.h header-y += ipt_realm.h header-y += ipt_recent.h -header-y += ipt_REJECT.h -header-y += ipt_SAME.h header-y += ipt_sctp.h header-y += ipt_state.h header-y += ipt_string.h header-y += ipt_tcpmss.h -header-y += ipt_TCPMSS.h header-y += ipt_tos.h -header-y += ipt_TOS.h header-y += ipt_ttl.h -header-y += 
ipt_TTL.h -header-y += ipt_ULOG.h unifdef-y += ip_queue.h unifdef-y += ip_tables.h diff --git a/include/linux/netfilter_ipv6/Kbuild b/include/linux/netfilter_ipv6/Kbuild index 9dd978d149f..8887a5fcd1d 100644 --- a/include/linux/netfilter_ipv6/Kbuild +++ b/include/linux/netfilter_ipv6/Kbuild @@ -14,8 +14,8 @@ header-y += ip6t_mark.h header-y += ip6t_multiport.h header-y += ip6t_opts.h header-y += ip6t_owner.h -header-y += ip6t_policy.h header-y += ip6t_physdev.h +header-y += ip6t_policy.h header-y += ip6t_rt.h unifdef-y += ip6_tables.h -- cgit v1.2.3-70-g09d2 From 6a9fb9479f2672fa392711735de9e642395c9a14 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 5 Nov 2007 21:32:31 -0800 Subject: [IPV4]: Clean the ip_sockglue.c from some ugly ifdefs The #ifdef CONFIG_IP_MROUTE is sometimes placed inside the if statements, which looks completely bad. Similar ifdefs inside the functions look a bit better, but they are also not recommended to be used. Provide an ifdef-ed ip_mroute_opt() helper to clean up the code. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/mroute.h | 12 ++++++++++++ net/ipv4/ip_sockglue.c | 39 ++++++++++++--------------------------- 2 files changed, 24 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7da2cee8e13..35a8277ec1b 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -128,6 +128,18 @@ struct igmpmsg #ifdef __KERNEL__ #include +#ifdef CONFIG_IP_MROUTE +static inline int ip_mroute_opt(int opt) +{ + return (opt >= MRT_BASE) && (opt <= MRT_BASE + 10); +} +#else +static inline int ip_mroute_opt(int opt) +{ + return 0; +} +#endif + extern int ip_mroute_setsockopt(struct sock *, int, char __user *, int); extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f51f20e487c..82817e55436 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -437,10 +437,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ -#ifdef CONFIG_IP_MROUTE - if (optname >= MRT_BASE && optname <= (MRT_BASE + 10)) + if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk,optname,optval,optlen); -#endif err = 0; lock_sock(sk); @@ -909,11 +907,9 @@ int ip_setsockopt(struct sock *sk, int level, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > (MRT_BASE + 10)) -#endif - ) { + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { lock_sock(sk); err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); release_sock(sk); @@ -935,11 +931,9 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > (MRT_BASE + 10)) -#endif - ) { + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { lock_sock(sk); err = compat_nf_setsockopt(sk, PF_INET, optname, optval, optlen); @@ -967,11 +961,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IP) return -EOPNOTSUPP; -#ifdef CONFIG_IP_MROUTE - if (optname >= MRT_BASE && optname <= MRT_BASE+10) { + if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk,optname,optval,optlen); - } -#endif if (get_user(len,optlen)) return -EFAULT; @@ -1171,11 +1162,8 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > MRT_BASE+10) -#endif - ) { + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { int len; if (get_user(len,optlen)) @@ -1200,11 +1188,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, int err = do_ip_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > MRT_BASE+10) -#endif - ) { + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { int len; if (get_user(len, optlen)) -- cgit v1.2.3-70-g09d2
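The ip_mroute_opt() change above is a general pattern worth noting: keep the #ifdef in one header, expose an always-defined predicate, and let the compiler fold the dead branch away when the feature is off, so no call site needs conditional compilation. Below is a minimal, self-contained user-space sketch of the same idea; CONFIG_MY_FEATURE, MY_FEATURE_BASE and my_feature_opt() are invented stand-ins for CONFIG_IP_MROUTE, MRT_BASE and ip_mroute_opt(), not kernel names.

#include <stdio.h>

#define MY_FEATURE_BASE 200

#ifdef CONFIG_MY_FEATURE
static int my_feature_opt(int opt)
{
        return opt >= MY_FEATURE_BASE && opt <= MY_FEATURE_BASE + 10;
}
#else
static int my_feature_opt(int opt)
{
        return 0;       /* feature compiled out: the branch below folds away */
}
#endif

static void do_setsockopt(int optname)
{
        if (my_feature_opt(optname))    /* no #ifdef at the call site */
                printf("option %d: feature-specific path\n", optname);
        else
                printf("option %d: generic path\n", optname);
}

int main(void)
{
        do_setsockopt(MY_FEATURE_BASE + 5);
        do_setsockopt(1);
        return 0;
}

Build it with and without -DCONFIG_MY_FEATURE: both configurations compile the identical call sites, which is exactly what the patch buys ip_sockglue.c.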
From 286ab3d46058840d68e5d7d52e316c1f7e98c59f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Nov 2007 23:38:39 -0800 Subject: [NET]: Define infrastructure to keep 'inuse' changes in an efficient SMP/NUMA way. "struct proto" currently uses an array stats[NR_CPUS] to track changes on 'inuse' sockets per protocol. If NR_CPUS is big, this means we use a big memory area for this. Moreover, all this memory area is located on a single node on NUMA machines, increasing memory pressure on the boot node. In this patch, I tried to: - Keep a fast !CONFIG_SMP implementation - Keep a fast CONFIG_SMP implementation for often used protocols (tcp,udp,raw,...) - Introduce a NUMA efficient implementation Some helper macros are defined in include/net/sock.h. These macros take into account CONFIG_SMP. If a "struct proto" is declared without using DEFINE_PROTO_INUSE / REF_PROTO_INUSE macros, it will automatically use a default implementation, using a dynamically allocated percpu zone. This default implementation will be NUMA efficient, but might use 32/64 bytes per possible cpu because of the current alloc_percpu() implementation. However, it should still be better than the previous implementation based on the stats[NR_CPUS] field. When a "struct proto" is changed to use the new macros, we use a single static "int" percpu variable, lowering the memory and cpu costs, still preserving NUMA efficiency. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/sock.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++------ net/core/sock.c | 48 ++++++++++++++++++++++++++++++++++++++++- net/ipv4/proc.c | 19 ++++------------ net/ipv6/proc.c | 19 ++++------------ 4 files changed, 112 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 20de3fa7ae4..5504fb9fa88 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -560,6 +560,14 @@ struct proto { void (*unhash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_SMP + /* Keeping track of sockets in use */ + void (*inuse_add)(struct proto *prot, int inc); + int (*inuse_getval)(const struct proto *prot); + int *inuse_ptr; +#else + int inuse; +#endif /* Memory pressure */ void (*enter_memory_pressure)(void); atomic_t *memory_allocated; /* Current allocated memory. */ @@ -592,12 +600,38 @@ struct proto { #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif - struct { - int inuse; - u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; - } stats[NR_CPUS]; }; +/* + * Special macros to let protos use a fast version of inuse{get|add} + * using a static percpu variable per proto instead of an allocated one, + * saving one dereference. + * This might be changed if/when dynamic percpu vars become fast. + */ +#ifdef CONFIG_SMP +# define DEFINE_PROTO_INUSE(NAME) \ +static DEFINE_PER_CPU(int, NAME##_inuse); \ +static void NAME##_inuse_add(struct proto *prot, int inc) \ +{ \ + __get_cpu_var(NAME##_inuse) += inc; \ +} \ + \ +static int NAME##_inuse_getval(const struct proto *prot)\ +{ \ + int res = 0, cpu; \ + \ + for_each_possible_cpu(cpu) \ + res += per_cpu(NAME##_inuse, cpu); \ + return res; \ +} +# define REF_PROTO_INUSE(NAME) \ + .inuse_add = NAME##_inuse_add, \ + .inuse_getval = NAME##_inuse_getval, +#else +# define DEFINE_PROTO_INUSE(NAME) +# define REF_PROTO_INUSE(NAME) +#endif + extern int proto_register(struct proto *prot, int alloc_slab); extern void proto_unregister(struct proto *prot); @@ -629,12 +663,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse++; +#ifdef CONFIG_SMP + prot->inuse_add(prot, 1); +#else + prot->inuse++; +#endif } static __inline__ void sock_prot_dec_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse--; +#ifdef CONFIG_SMP + prot->inuse_add(prot, -1); +#else + prot->inuse--; +#endif +} + +static __inline__ int sock_prot_inuse(struct proto *proto) +{ +#ifdef CONFIG_SMP + return proto->inuse_getval(proto); +#else + return proto->inuse; +#endif } /* With per-bucket locks this operation is not-atomic, so that diff --git a/net/core/sock.c b/net/core/sock.c index 12ad2067a98..e077f263b73 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1801,12 +1801,41 @@ EXPORT_SYMBOL(sk_common_release); static DEFINE_RWLOCK(proto_list_lock); static LIST_HEAD(proto_list); +#ifdef CONFIG_SMP +/* + * Define default functions to keep track of inuse sockets per protocol + * Note that often used protocols use dedicated functions to get a speed increase. 
+ * (see DEFINE_PROTO_INUSE/REF_PROTO_INUSE) + */ +static void inuse_add(struct proto *prot, int inc) +{ + per_cpu_ptr(prot->inuse_ptr, smp_processor_id())[0] += inc; +} + +static int inuse_get(const struct proto *prot) +{ + int res = 0, cpu; + for_each_possible_cpu(cpu) + res += per_cpu_ptr(prot->inuse_ptr, cpu)[0]; + return res; +} +#endif + int proto_register(struct proto *prot, int alloc_slab) { char *request_sock_slab_name = NULL; char *timewait_sock_slab_name; int rc = -ENOBUFS; +#ifdef CONFIG_SMP + if (!prot->inuse_getval || !prot->inuse_add) { + prot->inuse_ptr = alloc_percpu(int); + if (prot->inuse_ptr == NULL) + goto out; + prot->inuse_getval = inuse_get; + prot->inuse_add = inuse_add; + } +#endif if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -1814,7 +1843,7 @@ int proto_register(struct proto *prot, int alloc_slab) if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", prot->name); - goto out; + goto out_free_inuse; } if (prot->rsk_prot != NULL) { @@ -1873,6 +1902,15 @@ out_free_request_sock_slab_name: out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; +out_free_inuse: +#ifdef CONFIG_SMP + if (prot->inuse_ptr != NULL) { + free_percpu(prot->inuse_ptr); + prot->inuse_ptr = NULL; + prot->inuse_getval = NULL; + prot->inuse_add = NULL; + } +#endif goto out; } @@ -1884,6 +1922,14 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); write_unlock(&proto_list_lock); +#ifdef CONFIG_SMP + if (prot->inuse_ptr != NULL) { + free_percpu(prot->inuse_ptr); + prot->inuse_ptr = NULL; + prot->inuse_getval = NULL; + prot->inuse_add = NULL; + } +#endif if (prot->slab != NULL) { kmem_cache_destroy(prot->slab); prot->slab = NULL; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ffdccc0972e..ce34b281803 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -46,17 +46,6 @@ #include #include -static int fold_prot_inuse(struct proto *proto) -{ - int res = 0; - int cpu; - - for_each_possible_cpu(cpu) - res += proto->stats[cpu].inuse; - - return res; -} - /* * Report socket allocation statistics [mea@utu.fi] */ @@ -64,12 +53,12 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", - fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); - seq_printf(seq, "UDPLITE: inuse %d\n", fold_prot_inuse(&udplite_prot)); - seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); + seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot)); + seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot)); seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues(), ip_frag_mem()); return 0; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index be526ad9254..8631ed7fe8a 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -32,27 +32,16 @@ static struct proc_dir_entry *proc_net_devsnmp6; -static int fold_prot_inuse(struct proto *proto) -{ - int res = 0; - int cpu; - - for_each_possible_cpu(cpu) - res += proto->stats[cpu].inuse; - - return res; -} - static int sockstat6_seq_show(struct seq_file *seq, void *v) { seq_printf(seq, "TCP6: inuse %d\n", - fold_prot_inuse(&tcpv6_prot)); + 
sock_prot_inuse(&tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", - fold_prot_inuse(&udpv6_prot)); + sock_prot_inuse(&udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", - fold_prot_inuse(&udplitev6_prot)); + sock_prot_inuse(&udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", - fold_prot_inuse(&rawv6_prot)); + sock_prot_inuse(&rawv6_prot)); seq_printf(seq, "FRAG6: inuse %d memory %d\n", ip6_frag_nqueues(), ip6_frag_mem()); return 0; -- cgit v1.2.3-70-g09d2 From 44656ba1286d82b5a5f8817eb2e4ea744143c3ca Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 Nov 2007 04:10:52 -0800 Subject: [NET]: Kill proc_net_create() There are no more users. Signed-off-by: David S. Miller --- fs/proc/proc_net.c | 7 ------- include/linux/proc_fs.h | 3 --- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 749def054a3..153554cf557 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -26,13 +26,6 @@ #include "internal.h" -struct proc_dir_entry *proc_net_create(struct net *net, - const char *name, mode_t mode, get_info_t *get_info) -{ - return create_proc_info_entry(name,mode, net->proc_net, get_info); -} -EXPORT_SYMBOL_GPL(proc_net_create); - struct proc_dir_entry *proc_net_fops_create(struct net *net, const char *name, mode_t mode, const struct file_operations *fops) { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 1ff46167206..1273c6ec535 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -196,8 +196,6 @@ static inline struct proc_dir_entry *create_proc_info_entry(const char *name, return res; } -extern struct proc_dir_entry *proc_net_create(struct net *net, - const char *name, mode_t mode, get_info_t *get_info); extern struct proc_dir_entry *proc_net_fops_create(struct net *net, const char *name, mode_t mode, const struct file_operations *fops); extern void proc_net_remove(struct net *net, const char *name); @@ -208,7 +206,6 @@ extern void proc_net_remove(struct net *net, const char *name); #define proc_bus NULL #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) -#define proc_net_create(net, name, mode, info) ({ (void)(mode), NULL; }) static inline void proc_net_remove(struct net *net, const char *name) {} static inline void proc_flush_task(struct task_struct *task) -- cgit v1.2.3-70-g09d2 From c3e9a353d8fc64a82ab11a07e21902e25e1e96d1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 6 Nov 2007 23:34:04 -0800 Subject: [IPV4]: Compact some ifdefs in the fib code. There are places that check for CONFIG_IP_MULTIPLE_TABLES twice in the same file, but the internals of these #ifdefs can be merged. As a side effect - remove one ifdef from inside a function. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 15 ++++++--------- net/ipv4/fib_frontend.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 8cadc77c7df..ed514bfb61b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -185,6 +185,12 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result } #else /* CONFIG_IP_MULTIPLE_TABLES */ +extern void __init fib4_rules_init(void); + +#ifdef CONFIG_NET_CLS_ROUTE +extern u32 fib_rules_tclass(struct fib_result *res); +#endif + #define ip_fib_local_table fib_get_table(RT_TABLE_LOCAL) #define ip_fib_main_table fib_get_table(RT_TABLE_MAIN) @@ -214,15 +220,6 @@ extern __be32 __fib_res_prefsrc(struct fib_result *res); /* Exported by fib_hash.c */ extern struct fib_table *fib_hash_init(u32 id); -#ifdef CONFIG_IP_MULTIPLE_TABLES -extern void __init fib4_rules_init(void); - -#ifdef CONFIG_NET_CLS_ROUTE -extern u32 fib_rules_tclass(struct fib_result *res); -#endif - -#endif - static inline void fib_combine_itag(u32 *itag, struct fib_result *res) { #ifdef CONFIG_NET_CLS_ROUTE diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 60123905dbb..732d8f088b1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -59,6 +59,13 @@ struct fib_table *ip_fib_main_table; #define FIB_TABLE_HASHSZ 1 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +static void __init fib4_rules_init(void) +{ + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); + hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); +} #else #define FIB_TABLE_HASHSZ 256 @@ -905,14 +912,8 @@ void __init ip_fib_init(void) for (i = 0; i < FIB_TABLE_HASHSZ; i++) INIT_HLIST_HEAD(&fib_table_hash[i]); -#ifndef CONFIG_IP_MULTIPLE_TABLES - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); - hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); -#else + fib4_rules_init(); -#endif register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); -- cgit v1.2.3-70-g09d2 From 0fc00e2440b717e19bab1ae0015f03936bdf7967 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 7 Nov 2007 01:24:56 -0800 Subject: [TTY]: Fix network driver interactions with TCGET/SET calls. Dave Miller noted various cases where line disciplines for things like ppp go poking around in termios themselves in ways that broke with the new termios code. Rather than have them all learn about termios internals, provide proper methods for this: - tty_mode_ioctl() This handles all the terminal mode handling for speed/carrier etc, and none of the methods are ldisc dependent, so they can be called by any user. - tty_perform_flush() This extracts the flush functionality and enables pppd and the ppp layer to share it cleanly. The existing n_tty_ioctl code is refactored in this patch to provide the new functions and to call them itself appropriately. This patch has no (intended) behaviour changes and simply prepares for the other fixes. Signed-off-by: Alan Cox Signed-off-by: David S.
Miller --- drivers/char/tty_ioctl.c | 170 +++++++++++++++++++++++++++++------------------ include/linux/tty.h | 4 +- 2 files changed, 107 insertions(+), 67 deletions(-) (limited to 'include') diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index 7a003504c26..1bdd2bf4f37 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -730,13 +730,23 @@ static int send_prio_char(struct tty_struct *tty, char ch) return 0; } -int n_tty_ioctl(struct tty_struct * tty, struct file * file, - unsigned int cmd, unsigned long arg) +/** + * tty_mode_ioctl - mode related ioctls + * @tty: tty for the ioctl + * @file: file pointer for the tty + * @cmd: command + * @arg: ioctl argument + * + * Perform non line discipline specific mode control ioctls. This + * is designed to be called by line disciplines to ensure they provide + * consistent mode setting. + */ + +int tty_mode_ioctl(struct tty_struct * tty, struct file *file, + unsigned int cmd, unsigned long arg) { struct tty_struct * real_tty; void __user *p = (void __user *)arg; - int retval; - struct tty_ldisc *ld; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) @@ -799,6 +809,93 @@ int n_tty_ioctl(struct tty_struct * tty, struct file * file, return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETA: return set_termios(real_tty, p, TERMIOS_TERMIO); +#ifndef TCGETS2 + case TIOCGLCKTRMIOS: + if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios_locked)) + return -EFAULT; + return 0; + + case TIOCSLCKTRMIOS: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (user_termios_to_kernel_termios(real_tty->termios_locked, (struct termios __user *) arg)) + return -EFAULT; + return 0; +#else + case TIOCGLCKTRMIOS: + if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios_locked)) + return -EFAULT; + return 0; + + case TIOCSLCKTRMIOS: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (user_termios_to_kernel_termios_1(real_tty->termios_locked, (struct termios __user *) arg)) + return -EFAULT; + return 0; +#endif + case TIOCGSOFTCAR: + return put_user(C_CLOCAL(tty) ? 1 : 0, (int __user *)arg); + case TIOCSSOFTCAR: + if (get_user(arg, (unsigned int __user *) arg)) + return -EFAULT; + mutex_lock(&tty->termios_mutex); + tty->termios->c_cflag = + ((tty->termios->c_cflag & ~CLOCAL) | + (arg ? 
CLOCAL : 0)); + mutex_unlock(&tty->termios_mutex); + return 0; + default: + return -ENOIOCTLCMD; + } +} + +EXPORT_SYMBOL_GPL(tty_mode_ioctl); + +int tty_perform_flush(struct tty_struct *tty, unsigned long arg) +{ + struct tty_ldisc *ld; + int retval = tty_check_change(tty); + if (retval) + return retval; + + ld = tty_ldisc_ref(tty); + switch (arg) { + case TCIFLUSH: + if (ld && ld->flush_buffer) + ld->flush_buffer(tty); + break; + case TCIOFLUSH: + if (ld && ld->flush_buffer) + ld->flush_buffer(tty); + /* fall through */ + case TCOFLUSH: + if (tty->driver->flush_buffer) + tty->driver->flush_buffer(tty); + break; + default: + tty_ldisc_deref(ld); + return -EINVAL; + } + tty_ldisc_deref(ld); + return 0; +} + +EXPORT_SYMBOL_GPL(tty_perform_flush); + +int n_tty_ioctl(struct tty_struct * tty, struct file * file, + unsigned int cmd, unsigned long arg) +{ + struct tty_struct * real_tty; + int retval; + + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER) + real_tty = tty->link; + else + real_tty = tty; + + switch (cmd) { case TCXONC: retval = tty_check_change(tty); if (retval) @@ -829,30 +926,7 @@ int n_tty_ioctl(struct tty_struct * tty, struct file * file, } return 0; case TCFLSH: - retval = tty_check_change(tty); - if (retval) - return retval; - - ld = tty_ldisc_ref(tty); - switch (arg) { - case TCIFLUSH: - if (ld && ld->flush_buffer) - ld->flush_buffer(tty); - break; - case TCIOFLUSH: - if (ld && ld->flush_buffer) - ld->flush_buffer(tty); - /* fall through */ - case TCOFLUSH: - if (tty->driver->flush_buffer) - tty->driver->flush_buffer(tty); - break; - default: - tty_ldisc_deref(ld); - return -EINVAL; - } - tty_ldisc_deref(ld); - return 0; + return tty_perform_flush(tty, arg); case TIOCOUTQ: return put_user(tty->driver->chars_in_buffer ? tty->driver->chars_in_buffer(tty) : 0, @@ -862,32 +936,6 @@ int n_tty_ioctl(struct tty_struct * tty, struct file * file, if (L_ICANON(tty)) retval = inq_canon(tty); return put_user(retval, (unsigned int __user *) arg); -#ifndef TCGETS2 - case TIOCGLCKTRMIOS: - if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios_locked)) - return -EFAULT; - return 0; - - case TIOCSLCKTRMIOS: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (user_termios_to_kernel_termios(real_tty->termios_locked, (struct termios __user *) arg)) - return -EFAULT; - return 0; -#else - case TIOCGLCKTRMIOS: - if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios_locked)) - return -EFAULT; - return 0; - - case TIOCSLCKTRMIOS: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (user_termios_to_kernel_termios_1(real_tty->termios_locked, (struct termios __user *) arg)) - return -EFAULT; - return 0; -#endif - case TIOCPKT: { int pktmode; @@ -906,19 +954,9 @@ int n_tty_ioctl(struct tty_struct * tty, struct file * file, tty->packet = 0; return 0; } - case TIOCGSOFTCAR: - return put_user(C_CLOCAL(tty) ? 1 : 0, (int __user *)arg); - case TIOCSSOFTCAR: - if (get_user(arg, (unsigned int __user *) arg)) - return -EFAULT; - mutex_lock(&tty->termios_mutex); - tty->termios->c_cflag = - ((tty->termios->c_cflag & ~CLOCAL) | - (arg ? 
CLOCAL : 0)); - mutex_unlock(&tty->termios_mutex); - return 0; default: - return -ENOIOCTLCMD; + /* Try the mode commands */ + return tty_mode_ioctl(tty, file, cmd, arg); } } diff --git a/include/linux/tty.h b/include/linux/tty.h index 56164d7ba0a..c555f5442bd 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -332,7 +332,9 @@ extern void tty_ldisc_flush(struct tty_struct *tty); extern int tty_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); - +extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg); +extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); extern dev_t tty_devnum(struct tty_struct *tty); extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); -- cgit v1.2.3-70-g09d2 From 1e356f9cdfa885c78791d5d6e5d2baef22f01853 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 7 Nov 2007 02:35:54 -0800 Subject: [IPVS]: Bind connections on standby if the destination exists This patch fixes the problem with node overload on director fail-over. Given the scenario: 2 nodes each accepting 3 connections at a time and 2 directors, director failover occurs when the nodes are fully loaded (6 connections to the cluster); in this case the new director will assign another 6 connections to the cluster, if the same real servers exist there. The problem turned out to be in not binding the inherited connections to the real servers (destinations) on the backup director. Therefore: "ipvsadm -l" reports 0 connections: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 0 -> node484.local:5999 Route 1000 0 0 while "ipvsadm -lnc" is right root@test2:~# ipvsadm -lnc IPVS connection entries pro expire state source virtual destination TCP 14:56 ESTABLISHED 192.168.0.10:39164 192.168.0.222:5999 192.168.0.51:5999 TCP 14:59 ESTABLISHED 192.168.0.10:39165 192.168.0.222:5999 192.168.0.52:5999 So the patch I am sending fixes the problem by binding the received connections to the appropriate service on the backup director, if it exists; otherwise the connection will be handled the old way. So if the master and the backup directors are synchronized in terms of real services, there will be no problem with server over-committing, since new connections will not be created on the nonexistent real services on the backup. However, if the service is created later on the backup, the binding will be performed when the next connection update is received. With this patch the inherited connections will show as inactive on the backup: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 1 -> node484.local:5999 Route 1000 0 1 rumen@test2:~$ cat /proc/net/ip_vs IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP C0A800DE:176F wlc -> C0A80033:176F Route 1000 0 1 -> C0A80032:176F Route 1000 0 1 Regards, Rumen Bogdanovski Acked-by: Julian Anastasov Signed-off-by: Rumen G.
Bogdanovski Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ net/ipv4/ipvs/ip_vs_conn.c | 19 +++++++++++++++++++ net/ipv4/ipvs/ip_vs_ctl.c | 26 ++++++++++++++++++++++++++ net/ipv4/ipvs/ip_vs_sync.c | 24 ++++++++++++++++++++---- 4 files changed, 69 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 41870564df8..1fd1ee896f3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -901,6 +901,10 @@ extern int ip_vs_use_count_inc(void); extern void ip_vs_use_count_dec(void); extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); +extern struct ip_vs_dest * +ip_vs_find_dest(__be32 daddr, __be16 dport, + __be32 vaddr, __be16 vport, __u16 protocol); +extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); /* diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 4b702f708d3..b7eeae622d9 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -425,6 +425,25 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) } +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(cp->daddr, cp->dport, + cp->vaddr, cp->vport, cp->protocol); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} +EXPORT_SYMBOL(ip_vs_try_bind_dest); + + /* * Unbind a connection entry with its VS destination * Called by the ip_vs_conn_expire function. diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7345fc252a2..3c4d22a468e 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -579,6 +579,32 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) return NULL; } +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Cretaed to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promissing, but + * seems not working as expected. + */ +struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, + __be32 vaddr, __be16 vport, __u16 protocol) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(0, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} +EXPORT_SYMBOL(ip_vs_find_dest); /* * Lookup dest by {svc,addr,port} in the destination trash. diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 0d4d9721cbd..b1694d67abb 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -284,6 +284,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) struct ip_vs_sync_conn_options *opt; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_dest *dest; char *p; int i; @@ -317,20 +318,35 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) s->caddr, s->cport, s->vaddr, s->vport); if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. 
+ */ + dest = ip_vs_find_dest(s->daddr, s->dport, + s->vaddr, s->vport, + s->protocol); cp = ip_vs_conn_new(s->protocol, s->caddr, s->cport, s->vaddr, s->vport, s->daddr, s->dport, - flags, NULL); + flags, dest); + if (dest) + atomic_dec(&dest->refcnt); if (!cp) { IP_VS_ERR("ip_vs_conn_new failed\n"); return; } cp->state = ntohs(s->state); } else if (!cp->dest) { - /* it is an entry created by the synchronization */ - cp->state = ntohs(s->state); - cp->flags = flags | IP_VS_CONN_F_HASHED; + dest = ip_vs_try_bind_dest(cp); + if (!dest) { + /* it is an unbound entry created by + * synchronization */ + cp->state = ntohs(s->state); + cp->flags = flags | IP_VS_CONN_F_HASHED; + } else + atomic_dec(&dest->refcnt); } /* Note that we don't touch its state and flags if it is a normal entry. */ -- cgit v1.2.3-70-g09d2 From efac52762b1e3fe3035d29e82d8ee1aebc45e4a7 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 7 Nov 2007 02:36:55 -0800 Subject: [IPVS]: Synchronize closing of Connections This patch makes the master daemon sync the connection when it is about to close. This makes the connections on the backup close or time out according to their state. Before, the sync was performed only if the connection was in ESTABLISHED state, which always made the connections time out after the hard-coded 3 minutes. However, Andy Gospodarek's patch ([IPVS]: use proper timeout instead of fixed value) effectively did nothing more than increase this to 15 minutes (Established state timeout). So this patch makes use of the proper timeouts, since it syncs the connections on status changes to FIN_WAIT (2min timeout) and CLOSE (10sec timeout). However, if the backup misses CLOSE, hopefully it did not miss FIN_WAIT; otherwise we will just have to wait for the ESTABLISHED state timeout, as we would without this patch. This way the number of hanging connections on the backup is kept to a minimum, and very few of them will be left to expire with a long timeout. This is important if we want to make use of the fix for the real server overcommit on master/backup fail-over. Signed-off-by: Rumen G. Bogdanovski Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/net/ip_vs.h | 4 ++++ net/ipv4/ipvs/ip_vs_core.c | 20 ++++++++++++++------ net/ipv4/ipvs/ip_vs_sync.c | 2 +- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1fd1ee896f3..67ea2c0c0ab 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -520,6 +520,10 @@ struct ip_vs_conn { spinlock_t lock; /* lock for state transition */ volatile __u16 flags; /* status flags */ volatile __u16 state; /* state info */ + volatile __u16 old_state; /* old state, to be used for + * state transition triggerd + * synchronization + */ /* Control members */ struct ip_vs_conn *control; /* Master control connection */ diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index c6ed7654e83..20c884a5772 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -979,15 +979,23 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, ret = NF_ACCEPT; } - /* increase its packet counter and check if it is needed to be synchronized */ + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + */ atomic_inc(&cp->in_pkts); if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && - (cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE))))) ip_vs_sync_conn(cp); + cp->old_state = cp->state; ip_vs_conn_put(cp); return ret; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index b1694d67abb..bd930efc18d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -343,7 +343,6 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) if (!dest) { /* it is an unbound entry created by * synchronization */ - cp->state = ntohs(s->state); cp->flags = flags | IP_VS_CONN_F_HASHED; } else atomic_dec(&dest->refcnt); @@ -358,6 +357,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p += SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = ntohs(s->state); pp = ip_vs_proto_get(s->protocol); cp->timeout = pp->timeout_table[cp->state]; ip_vs_conn_put(cp); -- cgit v1.2.3-70-g09d2
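The condition this commit adds to ip_vs_in() is dense; restated as a standalone predicate it is easier to read and test. The sketch below is a rough user-space restatement under invented names (struct conn, should_sync(), the S_* states); threshold and period mirror sysctl_ip_vs_sync_threshold[0] and [1].

#include <stdbool.h>
#include <stdio.h>

enum { S_ESTABLISHED, S_FIN_WAIT, S_CLOSE };

#define PROTO_TCP 6

struct conn {
        int protocol;
        int state;              /* current state */
        int old_state;          /* state when we last looked */
        long in_pkts;           /* packet counter */
};

static bool should_sync(const struct conn *cp, long threshold, long period)
{
        /* periodic sync for established (or non-TCP) connections */
        if ((cp->protocol != PROTO_TCP || cp->state == S_ESTABLISHED) &&
            cp->in_pkts % period == threshold)
                return true;

        /* sync TCP transitions into FIN_WAIT/CLOSE, so the backup can
         * switch to the short FIN_WAIT/CLOSE timeouts instead of the
         * long ESTABLISHED one */
        if (cp->protocol == PROTO_TCP && cp->old_state != cp->state &&
            (cp->state == S_FIN_WAIT || cp->state == S_CLOSE))
                return true;

        return false;
}

int main(void)
{
        struct conn c = { PROTO_TCP, S_FIN_WAIT, S_ESTABLISHED, 7 };

        printf("sync on close transition: %d\n", should_sync(&c, 3, 50));
        return 0;
}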
From 230140cffa7feae90ad50bf259db1fa07674f3a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Nov 2007 02:40:20 -0800 Subject: [INET]: Remove per bucket rwlock in tcp/dccp ehash table. As done two years ago on the IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little performance difference. (We hit a different cache line for the rwlock, but then the bucket cache line has a better sharing factor among cpus, since we dirty it less often.) For netstat or ss commands that want a full scan of the hash table, we perform fewer memory accesses.
Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. Signed-off-by: Eric Dumazet Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 71 +++++++++++++++++++++++++++++++++++++++---- net/dccp/proto.c | 9 ++++-- net/ipv4/inet_diag.c | 9 +++--- net/ipv4/inet_hashtables.c | 7 +++-- net/ipv4/inet_timewait_sock.c | 13 ++++---- net/ipv4/tcp.c | 4 +-- net/ipv4/tcp_ipv4.c | 11 ++++--- net/ipv6/inet6_hashtables.c | 19 ++++++------ 8 files changed, 106 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4427dcd1e53..8461cda3749 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -37,7 +37,6 @@ * I'll experiment with dynamic table growth later. */ struct inet_ehash_bucket { - rwlock_t lock; struct hlist_head chain; struct hlist_head twchain; }; @@ -100,6 +99,9 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; + rwlock_t *ehash_locks; + unsigned int ehash_size; + unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. @@ -107,7 +109,7 @@ struct inet_hashinfo { struct inet_bind_hashbucket *bhash; unsigned int bhash_size; - unsigned int ehash_size; + /* Note : 4 bytes padding on 64 bit arches */ /* All sockets in TCP_LISTEN state will be in here. This is the only * table where wildcard'd TCP sockets can exist. 
Hash function here @@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket( return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } +static inline rwlock_t *inet_ehash_lockp( + struct inet_hashinfo *hashinfo, + unsigned int hash) +{ + return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; +} + +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, size = 256; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + if (nr_pcpus >= 4) + size = 512; + if (nr_pcpus >= 8) + size = 1024; + if (nr_pcpus >= 16) + size = 2048; + if (nr_pcpus >= 32) + size = 4096; + if (sizeof(rwlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(rwlock_t) > PAGE_SIZE) + hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); + else +#endif + hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), + GFP_KERNEL); + if (!hashinfo->ehash_locks) + return ENOMEM; + for (i = 0; i < size; i++) + rwlock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = size - 1; + return 0; +} + +static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) +{ + if (hashinfo->ehash_locks) { +#ifdef CONFIG_NUMA + unsigned int size = (hashinfo->ehash_locks_mask + 1) * + sizeof(rwlock_t); + if (size > PAGE_SIZE) + vfree(hashinfo->ehash_locks); + else +#else + kfree(hashinfo->ehash_locks); +#endif + hashinfo->ehash_locks = NULL; + } +} + extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, @@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo, sk->sk_hash = inet_sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; - lock = &head->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock(lock); } __sk_add_node(sk, list); @@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) inet_listen_wlock(hashinfo); lock = &hashinfo->lhash_lock; } else { - lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock_bh(lock); } @@ -354,9 +412,10 @@ static inline struct sock * */ unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); prefetch(head->chain.first); - read_lock(&head->lock); + read_lock(lock); sk_for_each(sk, node, &head->chain) { if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! 
*/ @@ -369,7 +428,7 @@ static inline struct sock * } sk = NULL; out: - read_unlock(&head->lock); + read_unlock(lock); return sk; hit: sock_hold(sk); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d8497392803..7a3bea9c28c 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1072,11 +1072,13 @@ static int __init dccp_init(void) } for (i = 0; i < dccp_hashinfo.ehash_size; i++) { - rwlock_init(&dccp_hashinfo.ehash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain); } + if (inet_ehash_locks_alloc(&dccp_hashinfo)) + goto out_free_dccp_ehash; + bhash_order = ehash_order; do { @@ -1091,7 +1093,7 @@ static int __init dccp_init(void) if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_ehash; + goto out_free_dccp_locks; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { @@ -1121,6 +1123,8 @@ out_free_dccp_mib: out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); dccp_hashinfo.bhash = NULL; +out_free_dccp_locks: + inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); dccp_hashinfo.ehash = NULL; @@ -1139,6 +1143,7 @@ static void __exit dccp_fini(void) free_pages((unsigned long)dccp_hashinfo.ehash, get_order(dccp_hashinfo.ehash_size * sizeof(struct inet_ehash_bucket))); + inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index dc429b6b0ba..b0170732b5e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -747,13 +747,14 @@ skip_listen_ht: for (i = s_i; i < hashinfo->ehash_size; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + rwlock_t *lock = inet_ehash_lockp(hashinfo, i); struct sock *sk; struct hlist_node *node; if (i > s_i) s_num = 0; - read_lock_bh(&head->lock); + read_lock_bh(lock); num = 0; sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); @@ -769,7 +770,7 @@ skip_listen_ht: r->id.idiag_dport) goto next_normal; if (inet_csk_diag_dump(sk, skb, cb) < 0) { - read_unlock_bh(&head->lock); + read_unlock_bh(lock); goto done; } next_normal: @@ -791,14 +792,14 @@ next_normal: r->id.idiag_dport) goto next_dying; if (inet_twsk_diag_dump(tw, skb, cb) < 0) { - read_unlock_bh(&head->lock); + read_unlock_bh(lock); goto done; } next_dying: ++num; } } - read_unlock_bh(&head->lock); + read_unlock_bh(lock); } done: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 16eecc7046a..67704da04fc 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; prefetch(head->chain.first); - write_lock(&head->lock); + write_lock(lock); /* Check TIME-WAIT sockets first. 
*/ sk_for_each(sk2, node, &head->twchain) { @@ -239,7 +240,7 @@ unique: BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); + write_unlock(lock); if (twp) { *twp = tw; @@ -255,7 +256,7 @@ unique: return 0; not_unique: - write_unlock(&head->lock); + write_unlock(lock); return -EADDRNOTAVAIL; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 4e189e28f30..a60b99e0ebd 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; /* Unlink from established hashes. */ - struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); - write_lock(&ehead->lock); + write_lock(lock); if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); + write_unlock(lock); return; } __hlist_del(&tw->tw_node); sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); + write_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; @@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in @@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); - write_lock(&ehead->lock); + write_lock(lock); /* Step 2: Remove SK from established hash. */ if (__sk_del_node_init(sk)) @@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, inet_twsk_add_node(tw, &ehead->twchain); atomic_inc(&tw->tw_refcnt); - write_unlock(&ehead->lock); + write_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c64072bb504..8e65182f7af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2456,11 +2456,11 @@ void __init tcp_init(void) thash_entries ? 
0 : 512 * 1024); tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; for (i = 0; i < tcp_hashinfo.ehash_size; i++) { - rwlock_init(&tcp_hashinfo.ehash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); } - + if (inet_ehash_locks_alloc(&tcp_hashinfo)) + panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e9127cdced2..e566f3c6767 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq) struct sock *sk; struct hlist_node *node; struct inet_timewait_sock *tw; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); - read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; @@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq) rc = tw; goto out; } - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2094,11 +2095,11 @@ get_tw: cur = tw; goto out; } - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; if (++st->bucket < tcp_hashinfo.ehash_size) { - read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; @@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); break; } } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d6f1026f194..adc73adadfa 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo, } else { unsigned int hash; sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (hashinfo->ehash_size - 1); - list = &hashinfo->ehash[hash].chain; - lock = &hashinfo->ehash[hash].lock; + list = &inet_ehash_bucket(hashinfo, hash)->chain; + lock = inet_ehash_lockp(hashinfo, hash); write_lock(lock); } @@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo, */ unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); prefetch(head->chain.first); - read_lock(&head->lock); + read_lock(lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. 
*/ if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif)) @@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo, goto hit; } } - read_unlock(&head->lock); + read_unlock(lock); return NULL; hit: sock_hold(sk); - read_unlock(&head->lock); + read_unlock(lock); return sk; } EXPORT_SYMBOL(__inet6_lookup_established); @@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const unsigned int hash = inet6_ehashfn(daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; prefetch(head->chain.first); - write_lock(&head->lock); + write_lock(lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &head->twchain) { @@ -216,7 +217,7 @@ unique: __sk_add_node(sk, &head->chain); sk->sk_hash = hash; sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); + write_unlock(lock); if (twp != NULL) { *twp = tw; @@ -231,7 +232,7 @@ unique: return 0; not_unique: - write_unlock(&head->lock); + write_unlock(lock); return -EADDRNOTAVAIL; } -- cgit v1.2.3-70-g09d2 From c3d8d1e30cace31fed6186a4b8c6b1401836d89c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 7 Nov 2007 02:42:09 -0800 Subject: [NETLINK]: Fix unicast timeouts Commit ed6dcf4a in the history.git tree broke netlink_unicast timeouts by moving the schedule_timeout() call to a new function that doesn't propagate the remaining timeout back to the caller. This means on each retry we start with the full timeout again. ipc/mqueue.c seems to actually want to wait indefinitely so this behaviour is retained. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 +- ipc/mqueue.c | 6 ++++-- net/netlink/af_netlink.c | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 7c1f3b1d2ee..d5bfaba595c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -192,7 +192,7 @@ extern int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfilp(struct file *filp); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, - long timeo, struct sock *ssk); + long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index bfa274ba9ed..1e04cd464af 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1010,6 +1010,8 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, return -EINVAL; } if (notification.sigev_notify == SIGEV_THREAD) { + long timeo; + /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); ret = -ENOMEM; @@ -1038,8 +1040,8 @@ retry: goto out; } - ret = netlink_attachskb(sock, nc, 0, - MAX_SCHEDULE_TIMEOUT, NULL); + timeo = MAX_SCHEDULE_TIMEOUT; + ret = netlink_attachskb(sock, nc, 0, &timeo, NULL); if (ret == 1) goto retry; if (ret) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 26017125557..415c97236f6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -752,7 +752,7 @@ struct sock *netlink_getsockbyfilp(struct file *filp) * 1: repeat lookup - reference dropped while waiting for socket memory. 
*/ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, - long timeo, struct sock *ssk) + long *timeo, struct sock *ssk) { struct netlink_sock *nlk; @@ -761,7 +761,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); - if (!timeo) { + if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); @@ -775,7 +775,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) - timeo = schedule_timeout(timeo); + *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); @@ -783,7 +783,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, if (signal_pending(current)) { kfree_skb(skb); - return sock_intr_errno(timeo); + return sock_intr_errno(*timeo); } return 1; } @@ -877,7 +877,7 @@ retry: if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb); - err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); + err = netlink_attachskb(sk, skb, nonblock, &timeo, ssk); if (err == 1) goto retry; if (err) -- cgit v1.2.3-70-g09d2
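The 'inuse' infrastructure from commit 286ab3d4 above reduces to a classic per-CPU counter: each CPU increments its own slot so writes stay cheap and contention-free, and readers fold all slots only when /proc is consulted. A minimal user-space sketch, assuming a fixed CPU count; NCPUS, current_cpu() and the tcp_inuse_* names are stand-ins for the kernel's per-cpu machinery (DEFINE_PER_CPU, smp_processor_id()), not real APIs.

#include <stdio.h>

#define NCPUS 4

static int tcp_inuse[NCPUS];    /* analogue of the static percpu variable */

static int current_cpu(void)
{
        return 0;       /* placeholder: the kernel uses smp_processor_id() */
}

static void tcp_inuse_add(int inc)
{
        tcp_inuse[current_cpu()] += inc;        /* fast, CPU-local write */
}

static int tcp_inuse_getval(void)
{
        int cpu, res = 0;

        for (cpu = 0; cpu < NCPUS; cpu++)       /* slow path: fold all CPUs */
                res += tcp_inuse[cpu];
        return res;
}

int main(void)
{
        tcp_inuse_add(1);
        tcp_inuse_add(1);
        tcp_inuse_add(-1);
        printf("inuse: %d\n", tcp_inuse_getval());      /* prints 1 */
        return 0;
}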
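Commit 0fc00e24 above splits the old n_tty_ioctl() into reusable pieces precisely so other line disciplines can delegate instead of reimplementing termios handling. The fragment below shows the intended call pattern for a hypothetical line discipline; it assumes a kernel build environment with the 2.6.24-era signatures introduced by that patch, and example_ldisc_ioctl() is an invented name, not an existing driver.

#include <linux/tty.h>

/* Hypothetical ldisc ioctl handler: keep only ldisc-specific commands
 * here, hand TCFLSH to the shared flush helper, and let tty_mode_ioctl()
 * cover the common terminal-mode ioctls. */
static int example_ldisc_ioctl(struct tty_struct *tty, struct file *file,
                               unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case TCFLSH:
                return tty_perform_flush(tty, arg);
        default:
                return tty_mode_ioctl(tty, file, cmd, arg);
        }
}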
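The IPVS binding fix (commit 1e356f9c) pairs a lookup that returns with an extra reference against an unconditional drop once the connection holds its own reference. A toy model of that refcount discipline, with invented names (struct dest, find_dest(), bind_dest()) and a plain int instead of atomic_t:

#include <stdio.h>

struct dest {
        int refcnt;
};

static struct dest server = { .refcnt = 1 };    /* registry holds one ref */

/* lookup returns with an extra reference, like ip_vs_find_dest() */
static struct dest *find_dest(void)
{
        server.refcnt++;
        return &server;
}

struct conn {
        struct dest *dest;
};

/* binding takes its own reference, like ip_vs_bind_dest() */
static void bind_dest(struct conn *cp, struct dest *d)
{
        if (d) {
                d->refcnt++;
                cp->dest = d;
        }
}

int main(void)
{
        struct conn cp = { 0 };
        struct dest *d = find_dest();   /* +1 from the lookup */

        bind_dest(&cp, d);              /* +1 held by the connection */
        if (d)
                d->refcnt--;            /* drop the lookup reference */
        printf("refcnt now %d (registry + connection)\n", d->refcnt);
        return 0;
}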
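The ehash rwlock commit (230140cf) rests on one small trick: buckets and locks are decoupled, with a power-of-two lock array indexed by the low bits of the bucket hash, so millions of buckets share a few hundred locks. A user-space sketch of the striping arithmetic follows, with a fixed table size instead of the num_possible_cpus()-based sizing done by the real inet_ehash_locks_alloc(); compile with -lpthread.

#include <pthread.h>
#include <stdio.h>

#define EHASH_LOCKS 256         /* power of two, like the real sizing */

static pthread_rwlock_t ehash_locks[EHASH_LOCKS];
static const unsigned int ehash_locks_mask = EHASH_LOCKS - 1;

/* same mapping idea as inet_ehash_lockp(): bucket hash -> striped lock */
static pthread_rwlock_t *ehash_lockp(unsigned int hash)
{
        return &ehash_locks[hash & ehash_locks_mask];
}

int main(void)
{
        for (unsigned int i = 0; i < EHASH_LOCKS; i++)
                pthread_rwlock_init(&ehash_locks[i], NULL);

        /* buckets far apart may share a lock; correctness only needs the
         * bucket -> lock mapping to be stable */
        unsigned int h1 = 0x1234, h2 = h1 + EHASH_LOCKS;

        printf("same lock: %d\n", ehash_lockp(h1) == ehash_lockp(h2));

        pthread_rwlock_rdlock(ehash_lockp(h1));
        /* ... scan the bucket chain under the striped lock ... */
        pthread_rwlock_unlock(ehash_lockp(h1));
        return 0;
}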
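Finally, the netlink_attachskb() fix (c3d8d1e3) carries a general lesson: when a blocking helper can ask its caller to retry, the remaining timeout must travel with the retry, which is why the signature changed from 'long timeo' to 'long *timeo'. A runnable toy model of that contract, where wait_for_room() is an invented stand-in for the sleep inside netlink_attachskb():

#include <stdio.h>

static long wait_for_room(long timeo)
{
        long slept = timeo < 10 ? timeo : 10;   /* pretend we slept this long */
        return timeo - slept;                   /* remaining budget */
}

/* returns 1 to ask the caller to retry (mirroring netlink_attachskb),
 * 0 once the timeout budget is exhausted */
static int attach(long *timeo)
{
        *timeo = wait_for_room(*timeo);         /* consume the shared budget */
        return *timeo > 0;
}

int main(void)
{
        long timeo = 25;
        int retries = 0;

        while (attach(&timeo))  /* each retry sees the reduced timeout */
                retries++;
        printf("stopped after %d retries, remaining %ld\n", retries, timeo);
        return 0;
}

Without the pointer, every retry would restart with the full 25-unit budget and the loop could spin far longer than the caller asked for, which is exactly the bug the commit describes.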