diff options
Diffstat (limited to 'net/netfilter')
62 files changed, 3110 insertions, 1210 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c37467562fd..e9410d17619 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -513,7 +513,6 @@ config NFT_QUEUE config NFT_REJECT depends on NF_TABLES - depends on NF_TABLES_IPV6 || !NF_TABLES_IPV6 default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" help @@ -521,6 +520,11 @@ config NFT_REJECT explicitly deny and notify via TCP reset/ICMP informational errors unallowed traffic. +config NFT_REJECT_INET + depends on NF_TABLES_INET + default NFT_REJECT + tristate + config NFT_COMPAT depends on NF_TABLES depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ee9c4de5f8e..bffdad774da 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -79,6 +79,7 @@ obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 44cd4f58adf..2f7f5c32c6f 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -61,6 +61,15 @@ config IP_SET_HASH_IP To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_IPMARK + tristate "hash:ip,mark set support" + depends on IP_SET + help + This option adds the hash:ip,mark set type support, by which one + can store IPv4/IPv6 address and mark pairs. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_IPPORT tristate "hash:ip,port set support" depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 44b2d38476f..231f10196cb 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o # hash types obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index de770ec39e5..ec8114fae50 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -54,10 +54,10 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex is held: */ -#define nfnl_dereference(p) \ +#define ip_set_dereference(p) \ rcu_dereference_protected(p, 1) -#define nfnl_set(inst, id) \ - nfnl_dereference((inst)->ip_set_list)[id] +#define ip_set(inst, id) \ + ip_set_dereference((inst)->ip_set_list)[id] /* * The set types are implemented in modules and registered set types @@ -271,10 +271,7 @@ ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); - if (is_vmalloc_addr(members)) - vfree(members); - else - kfree(members); + kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); @@ -368,6 +365,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) + set->flags |= IPSET_CREATE_FLAG_FORCEADD; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; @@ -510,7 +509,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) - return 0; + return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); @@ -533,7 +532,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) - return 0; + return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); @@ -640,7 +639,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) return IPSET_INVALID_ID; nfnl_lock(NFNL_SUBSYS_IPSET); - set = nfnl_set(inst, index); + set = ip_set(inst, index); if (set) __ip_set_get(set); else @@ -666,7 +665,7 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index) nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ - set = nfnl_set(inst, index); + set = ip_set(inst, index); if (set != NULL) __ip_set_put(set); } @@ -734,7 +733,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { - set = nfnl_set(inst, i); + set = ip_set(inst, i); if (set != NULL && STREQ(set->name, name)) { *id = i; break; @@ -760,7 +759,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s == NULL) { if (*index == IPSET_INVALID_ID) *index = i; @@ -883,7 +882,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = nfnl_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst->ip_set_list); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ @@ -900,7 +899,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, * Finally! Add our shiny new set to the list, and be done. */ pr_debug("create: '%s' created with index %u!\n", set->name, index); - nfnl_set(inst, index) = set; + ip_set(inst, index) = set; return ret; @@ -925,10 +924,10 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { static void ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) { - struct ip_set *set = nfnl_set(inst, index); + struct ip_set *set = ip_set(inst, index); pr_debug("set: %s\n", set->name); - nfnl_set(inst, index) = NULL; + ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -962,7 +961,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL && s->ref) { ret = -IPSET_ERR_BUSY; goto out; @@ -970,7 +969,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, } read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL) ip_set_destroy_set(inst, i); } @@ -1020,7 +1019,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL) ip_set_flush_set(s); } @@ -1074,7 +1073,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL && STREQ(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; @@ -1134,8 +1133,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); - nfnl_set(inst, from_id) = to; - nfnl_set(inst, to_id) = from; + ip_set(inst, from_id) = to; + ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; @@ -1157,7 +1156,7 @@ ip_set_dump_done(struct netlink_callback *cb) struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; if (cb->args[IPSET_CB_ARG0]) { pr_debug("release set %s\n", - nfnl_set(inst, cb->args[IPSET_CB_INDEX])->name); + ip_set(inst, cb->args[IPSET_CB_INDEX])->name); __ip_set_put_byindex(inst, (ip_set_id_t) cb->args[IPSET_CB_INDEX]); } @@ -1254,7 +1253,7 @@ dump_last: dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; - set = nfnl_set(inst, index); + set = ip_set(inst, index); if (set == NULL) { if (dump_type == DUMP_ONE) { ret = -ENOENT; @@ -1332,7 +1331,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", nfnl_set(inst, index)->name); + pr_debug("release set %s\n", ip_set(inst, index)->name); __ip_set_put_byindex(inst, index); cb->args[IPSET_CB_ARG0] = 0; } @@ -1887,7 +1886,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; if (id != IPSET_INVALID_ID) - req_get->family = nfnl_set(inst, id)->family; + req_get->family = ip_set(inst, id)->family; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } @@ -1901,7 +1900,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) goto done; } nfnl_lock(NFNL_SUBSYS_IPSET); - set = nfnl_set(inst, req_get->set.index); + set = ip_set(inst, req_get->set.index); strncpy(req_get->set.name, set ? set->name : "", IPSET_MAXNAMELEN); nfnl_unlock(NFNL_SUBSYS_IPSET); @@ -1945,7 +1944,6 @@ ip_set_net_init(struct net *net) return -ENOMEM; inst->is_deleted = 0; rcu_assign_pointer(inst->ip_set_list, list); - pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -1960,7 +1958,7 @@ ip_set_net_exit(struct net *net) inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ for (i = 0; i < inst->ip_set_max; i++) { - set = nfnl_set(inst, i); + set = ip_set(inst, i); if (set != NULL) ip_set_destroy_set(inst, i); } @@ -1996,6 +1994,7 @@ ip_set_init(void) nfnetlink_subsys_unregister(&ip_set_netlink_subsys); return ret; } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index be6932ad3a8..61c7fb05280 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -263,6 +263,9 @@ struct htype { u32 maxelem; /* max elements in the hash */ u32 elements; /* current element (vs timeout) */ u32 initval; /* random jhash init value */ +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; /* markmask value for mark mask to store */ +#endif struct timer_list gc; /* garbage collection when timeout enabled */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_MULTI @@ -454,6 +457,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) #ifdef IP_SET_HASH_WITH_NETMASK x->netmask == y->netmask && #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + x->markmask == y->markmask && +#endif a->extensions == b->extensions; } @@ -627,6 +633,18 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, bool flag_exist = flags & IPSET_FLAG_EXIST; u32 key, multi = 0; + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) { + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t,key); + if (n->pos) { + /* Choosing the first entry in the array to replace */ + j = 0; + goto reuse_slot; + } + rcu_read_unlock_bh(); + } if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) /* FIXME: when set is full, we slow down here */ mtype_expire(set, h, NLEN(set->family), set->dsize); @@ -908,6 +926,10 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) + goto nla_put_failure; +#endif if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; @@ -1016,6 +1038,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; +#endif u8 hbits; #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; @@ -1026,6 +1051,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; + +#ifdef IP_SET_HASH_WITH_MARKMASK + markmask = 0xffffffff; +#endif #ifdef IP_SET_HASH_WITH_NETMASK netmask = set->family == NFPROTO_IPV4 ? 32 : 128; pr_debug("Create set %s with family %s\n", @@ -1034,6 +1063,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || +#ifdef IP_SET_HASH_WITH_MARKMASK + !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || +#endif !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -1057,6 +1089,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, return -IPSET_ERR_INVALID_NETMASK; } #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (tb[IPSET_ATTR_MARKMASK]) { + markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + + if ((markmask > 4294967295u) || markmask == 0) + return -IPSET_ERR_INVALID_MARKMASK; + } +#endif hsize = sizeof(*h); #ifdef IP_SET_HASH_WITH_NETS @@ -1071,6 +1111,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifdef IP_SET_HASH_WITH_NETMASK h->netmask = netmask; #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + h->markmask = markmask; +#endif get_random_bytes(&h->initval, sizeof(h->initval)); set->timeout = IPSET_NO_TIMEOUT; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e65fc2423d5..dd40607f878 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -25,7 +25,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ -#define IPSET_TYPE_REV_MAX 2 /* Comments support */ +/* 2 Comments support */ +#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c new file mode 100644 index 00000000000..4eff0a29725 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -0,0 +1,321 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:ip,mark type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); +IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,mark"); + +/* Type specific function prefix */ +#define HTYPE hash_ipmark +#define IP_SET_HASH_WITH_MARKMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipmark4_elem { + __be32 ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, + const struct hash_ipmark4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark4_data_list(struct sk_buff *skb, + const struct hash_ipmark4_elem *data) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark4_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark4_elem *d) +{ + next->ip = d->ip; +} + +#define MTYPE hash_ipmark4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip, ip_to = 0; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip = ntohl(e.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + e.ip = htonl(ip); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipmark6_elem { + union nf_inet_addr ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, + const struct hash_ipmark6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark6_data_list(struct sk_buff *skb, + const struct hash_ipmark6_elem *data) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark6_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_ipmark6 +#define PF 6 +#define HOST_MASK 128 +#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + + +static int +hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + + return ret; +} + +static struct ip_set_type hash_ipmark_type __read_mostly = { + .name = "hash:ip,mark", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipmark_create, + .create_policy = { + [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_MARK] = { .type = NLA_U32 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipmark_init(void) +{ + return ip_set_type_register(&hash_ipmark_type); +} + +static void __exit +hash_ipmark_fini(void) +{ + ip_set_type_unregister(&hash_ipmark_type); +} + +module_init(hash_ipmark_init); +module_exit(hash_ipmark_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 525a595dd1f..7597b82a8b0 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ -#define IPSET_TYPE_REV_MAX 3 /* Comments support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index f5636631466..672655ffd57 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ -#define IPSET_TYPE_REV_MAX 3 /* Comments support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5d87fe8a41f..7308d84f927 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -29,7 +29,8 @@ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ -#define IPSET_TYPE_REV_MAX 5 /* Comments support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 8295cf4f9fd..4c7d495783a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -26,7 +26,8 @@ /* 1 Range as input support for IPv4 added */ /* 2 nomatch flag support added */ /* 3 Counters support added */ -#define IPSET_TYPE_REV_MAX 4 /* Comments support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index b827a0f1f35..db2606805b3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -27,7 +27,8 @@ /* 1 nomatch flag support added */ /* 2 /0 support added */ /* 3 Counters support added */ -#define IPSET_TYPE_REV_MAX 4 /* Comments support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 6226803fc49..3e99987e4bf 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -24,7 +24,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -112,10 +112,10 @@ hash_netnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -334,10 +334,10 @@ hash_netnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 7097fb0141b..1c645fbd09c 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -28,7 +28,8 @@ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ -#define IPSET_TYPE_REV_MAX 5 /* Comments support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 703d1192a6a..c0d2ba73f8b 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -25,7 +25,8 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 /* Comments support added */ +/* 0 Comments support added */ +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 4f29fa97044..04d15fdc99e 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -7,8 +7,8 @@ #define E(a, b, c, d) \ {.ip6 = { \ - __constant_htonl(a), __constant_htonl(b), \ - __constant_htonl(c), __constant_htonl(d), \ + htonl(a), htonl(b), \ + htonl(c), htonl(d), \ } } /* diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 59a1a85bcb3..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -797,7 +797,6 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_control_del(cp); if (cp->flags & IP_VS_CONN_F_NFCT) { - ip_vs_conn_drop_conntrack(cp); /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -871,11 +870,11 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); - cp->vport = p->vport; - /* proto should only be IPPROTO_IP if d_addr is a fwmark */ + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + &cp->vaddr, p->vaddr); + cp->vport = p->vport; + ip_vs_addr_set(p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f26ee46b51..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -97,7 +97,7 @@ const char *ip_vs_proto_name(unsigned int proto) return "ICMPv6"; #endif default: - sprintf(buf, "IP_%d", proto); + sprintf(buf, "IP_%u", proto); return buf; } } @@ -1392,15 +1392,19 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (ipip) { __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; /* Update the MTU */ if (ic->type == ICMP_DEST_UNREACH && ic->code == ICMP_FRAG_NEEDED) { struct ip_vs_dest *dest = cp->dest; u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; /* Strip outer IP and ICMP, go to IPIP header */ - __skb_pull(skb, ihl + sizeof(_icmph)); + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", @@ -1408,7 +1412,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ipv4_update_pmtu(skb, dev_net(skb->dev), mtu, 0, 0, 0, 0); /* Client uses PMTUD? */ - if (!(cih->frag_off & htons(IP_DF))) + if (!(frag_off & htons(IP_DF))) goto ignore_ipip; /* Prefer the resulting PMTU */ if (dest) { @@ -1427,12 +1431,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* Strip outer IP, ICMP and IPIP, go to IP header of * original request. */ - __skb_pull(skb, offset2); + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, - ic->type, ic->code, ntohl(info)); - icmp_send(skb, ic->type, ic->code, info); + type, code, ntohl(info)); + icmp_send(skb, type, code, info); /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 35be035ee0c..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2177,10 +2177,10 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) __u64 inbytes, outbytes; do { - start = u64_stats_fetch_begin_bh(&u->syncp); + start = u64_stats_fetch_begin_irq(&u->syncp); inbytes = u->ustats.inbytes; outbytes = u->ustats.outbytes; - } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", i, u->ustats.conns, u->ustats.inpkts, @@ -3580,7 +3580,7 @@ out: } -static const struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .flags = GENL_ADMIN_PERM, @@ -3778,6 +3778,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } #else @@ -3840,7 +3841,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index ca056a331e6..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) spin_lock_bh(&svc->sched_lock); tbl->dead = 1; - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_del(en); atomic_dec(&tbl->entries); @@ -265,7 +265,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) unsigned long now = jiffies; int i, j; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); @@ -321,7 +321,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); @@ -340,7 +340,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) tbl->rover = j; out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); } @@ -363,7 +363,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; @@ -536,8 +536,7 @@ out: /* * IPVS LBLC Scheduler structure */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index c47444e4cf8..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -562,7 +562,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -590,7 +590,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -684,7 +684,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -774,7 +774,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); @@ -883,10 +883,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -974,7 +974,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -1023,7 +1023,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(ip_hdr(skb)); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -1060,7 +1060,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -1157,7 +1157,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -1249,7 +1249,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp_v6(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8824ed0ccc9..1f4f954c4b4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -60,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, const struct nlattr *attr) __read_mostly; EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); -DEFINE_SPINLOCK(nf_conntrack_lock); -EXPORT_SYMBOL_GPL(nf_conntrack_lock); +__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; +EXPORT_SYMBOL_GPL(nf_conntrack_locks); + +__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); + +static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) +{ + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + spin_unlock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_unlock(&nf_conntrack_locks[h2]); +} + +/* return true if we need to recompute hashes (in case hash table was resized) */ +static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, + unsigned int h2, unsigned int sequence) +{ + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + if (h1 <= h2) { + spin_lock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_lock_nested(&nf_conntrack_locks[h2], + SINGLE_DEPTH_NESTING); + } else { + spin_lock(&nf_conntrack_locks[h2]); + spin_lock_nested(&nf_conntrack_locks[h1], + SINGLE_DEPTH_NESTING); + } + if (read_seqcount_retry(&net->ct.generation, sequence)) { + nf_conntrack_double_unlock(h1, h2); + return true; + } + return false; +} + +static void nf_conntrack_all_lock(void) +{ + int i; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_nested(&nf_conntrack_locks[i], i); +} + +static void nf_conntrack_all_unlock(void) +{ + int i; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_unlock(&nf_conntrack_locks[i]); +} unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -192,6 +243,50 @@ clean_from_lists(struct nf_conn *ct) nf_ct_remove_expectations(ct); } +/* must be called with local_bh_disable */ +static void nf_ct_add_to_dying_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) dying list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->dying); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) unconfirmed list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->unconfirmed); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* We overload first tuple to link into unconfirmed or dying list.*/ + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&pcpu->lock); +} + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -203,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); - /* To make sure we don't get any weird locking issues here: - * destroy_conntrack() MUST NOT be called with a write lock - * to nf_conntrack_lock!!! -HW */ rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -213,19 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct) rcu_read_unlock(); - spin_lock_bh(&nf_conntrack_lock); + local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, - * too. */ + * too. + */ nf_ct_remove_expectations(ct); - /* We overload first tuple to link into unconfirmed or dying list.*/ - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + nf_ct_del_from_dying_or_unconfirmed_list(ct); NF_CT_STAT_INC(net, delete); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (ct->master) nf_ct_put(ct->master); @@ -237,17 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct) static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + unsigned int hash, reply_hash; + u16 zone = nf_ct_zone(ct); + unsigned int sequence; nf_ct_helper_destroy(ct); - spin_lock_bh(&nf_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. */ - NF_CT_STAT_INC(net, delete_list); + + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + clean_from_lists(ct); - /* add this conntrack to the dying list */ - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.dying); - spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + + nf_ct_add_to_dying_list(ct); + + NF_CT_STAT_INC(net, delete_list); + local_bh_enable(); } static void death_by_event(unsigned long ul_conntrack) @@ -312,12 +414,25 @@ static void death_by_timeout(unsigned long ul_conntrack) nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); } +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, + const struct nf_conntrack_tuple *tuple, + u16 zone) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + /* A conntrack can be recreated with the equal tuple, + * so we need to check that the conntrack is confirmed + */ + return nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone && + nf_ct_is_confirmed(ct); +} + /* * Warning : * - Caller must take a reference on returned object * and recheck nf_ct_tuple_equal(tuple, &h->tuple) - * OR - * - Caller must lock nf_conntrack_lock before calling this function */ static struct nf_conntrack_tuple_hash * ____nf_conntrack_find(struct net *net, u16 zone, @@ -333,8 +448,7 @@ ____nf_conntrack_find(struct net *net, u16 zone, local_bh_disable(); begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { + if (nf_ct_key_equal(h, tuple, zone)) { NF_CT_STAT_INC(net, found); local_bh_enable(); return h; @@ -372,8 +486,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || - nf_ct_zone(ct) != zone)) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { nf_ct_put(ct); goto begin; } @@ -395,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, - unsigned int repl_hash) + unsigned int reply_hash) { struct net *net = nf_ct_net(ct); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &net->ct.hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[repl_hash]); + &net->ct.hash[reply_hash]); } int nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; u16 zone; + unsigned int sequence; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - spin_lock_bh(&nf_conntrack_lock); + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) @@ -428,32 +545,57 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; add_timer(&ct->timeout); - nf_conntrack_get(&ct->ct_general); - __nf_conntrack_hash_insert(ct, hash, repl_hash); + smp_wmb(); + /* The caller holds a reference to this object */ + atomic_set(&ct->ct_general.use, 2); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); - spin_unlock_bh(&nf_conntrack_lock); - + local_bh_enable(); return 0; out: + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); return -EEXIST; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ + struct ct_pcpu *pcpu; + + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + + /* add this conntrack to the (per cpu) tmpl list */ + local_bh_disable(); + tmpl->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); + + spin_lock(&pcpu->lock); + /* Overload tuple linked list to put us in template list. */ + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->tmpl); + spin_unlock_bh(&pcpu->lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; @@ -462,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; u16 zone; + unsigned int sequence; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -474,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; zone = nf_ct_zone(ct); - /* reuse the hash saved before */ - hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); - repl_hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + local_bh_disable(); + + do { + sequence = read_seqcount_begin(&net->ct.generation); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related - connections for unconfirmed conns. But packet copies and - REJECT will give spurious warnings here. */ + * connections for unconfirmed conns. But packet copies and + * REJECT will give spurious warnings here. + */ /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ /* No external references means no one else could have - confirmed us. */ + * confirmed us. + */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); - - spin_lock_bh(&nf_conntrack_lock); - /* We have to check the DYING flag inside the lock to prevent a race against nf_ct_get_next_corpse() possibly called from user context, else we insert an already 'dead' hash, blocking further use of that particular connection -JM */ if (unlikely(nf_ct_is_dying(ct))) { - spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + local_bh_enable(); return NF_ACCEPT; } @@ -510,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb) &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - /* Remove from unconfirmed list */ - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + nf_ct_del_from_dying_or_unconfirmed_list(ct); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in @@ -540,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) * guarantee that no other CPU can find the conntrack before the above * stores are visible. */ - __nf_conntrack_hash_insert(ct, hash, repl_hash); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); help = nfct_help(ct); if (help && help->helper) @@ -553,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); return NF_DROP; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -597,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; struct nf_conn *ct = NULL, *tmp; struct hlist_nulls_node *n; - unsigned int i, cnt = 0; + unsigned int i = 0, cnt = 0; int dropped = 0; + unsigned int hash, sequence; + spinlock_t *lockp; - rcu_read_lock(); - for (i = 0; i < net->ct.htable_size; i++) { + local_bh_disable(); +restart: + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_bucket(_hash, net); + for (; i < net->ct.htable_size; i++) { + lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (read_seqcount_retry(&net->ct.generation, sequence)) { + spin_unlock(lockp); + goto restart; + } hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) + if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && + !nf_ct_is_dying(tmp) && + atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; + break; + } cnt++; } - if (ct != NULL) { - if (likely(!nf_ct_is_dying(ct) && - atomic_inc_not_zero(&ct->ct_general.use))) - break; - else - ct = NULL; - } + hash = (hash + 1) % net->ct.htable_size; + spin_unlock(lockp); - if (cnt >= NF_CT_EVICTION_RANGE) + if (ct || cnt >= NF_CT_EVICTION_RANGE) break; - hash = (hash + 1) % net->ct.htable_size; } - rcu_read_unlock(); + local_bh_enable(); if (!ct) return dropped; @@ -678,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { - if (!early_drop(net, hash_bucket(hash, net))) { + if (!early_drop(net, hash)) { atomic_dec(&net->ct.count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); @@ -720,11 +879,10 @@ __nf_conntrack_alloc(struct net *net, u16 zone, nf_ct_zone->id = zone; } #endif - /* - * changes to lookup keys must be done before setting refcnt to 1 + /* Because we use RCU lookups, we set ct_general.use to zero before + * this is inserted in any list. */ - smp_wmb(); - atomic_set(&ct->ct_general.use, 1); + atomic_set(&ct->ct_general.use, 0); return ct; #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -748,10 +906,15 @@ void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + /* A freed object has refcnt == 0, that's + * the golden rule for SLAB_DESTROY_BY_RCU + */ + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -771,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_ecache *ecache; - struct nf_conntrack_expect *exp; + struct nf_conntrack_expect *exp = NULL; u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; struct nf_conn_timeout *timeout_ext; unsigned int *timeouts; @@ -815,39 +978,44 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, ecache ? ecache->expmask : 0, GFP_ATOMIC); - spin_lock_bh(&nf_conntrack_lock); - exp = nf_ct_find_expectation(net, zone, tuple); - if (exp) { - pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", - ct, exp); - /* Welcome, Mr. Bond. We've been expecting you... */ - __set_bit(IPS_EXPECTED_BIT, &ct->status); - ct->master = exp->master; - if (exp->helper) { - help = nf_ct_helper_ext_add(ct, exp->helper, - GFP_ATOMIC); - if (help) - rcu_assign_pointer(help->helper, exp->helper); - } + local_bh_disable(); + if (net->ct.expect_count) { + spin_lock(&nf_conntrack_expect_lock); + exp = nf_ct_find_expectation(net, zone, tuple); + if (exp) { + pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + ct, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &ct->status); + /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ + ct->master = exp->master; + if (exp->helper) { + help = nf_ct_helper_ext_add(ct, exp->helper, + GFP_ATOMIC); + if (help) + rcu_assign_pointer(help->helper, exp->helper); + } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = exp->master->mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK - ct->secmark = exp->master->secmark; + ct->secmark = exp->master->secmark; #endif - nf_conntrack_get(&ct->master->ct_general); - NF_CT_STAT_INC(net, expect_new); - } else { + NF_CT_STAT_INC(net, expect_new); + } + spin_unlock(&nf_conntrack_expect_lock); + } + if (!exp) { __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); NF_CT_STAT_INC(net, new); } - /* Overload tuple linked list to put us in unconfirmed list. */ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.unconfirmed); + /* Now it is inserted into the unconfirmed list, bump refcount */ + nf_conntrack_get(&ct->ct_general); + nf_ct_add_to_unconfirmed_list(ct); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (exp) { if (exp->expectfn) @@ -1217,27 +1385,42 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; + int cpu; + spinlock_t *lockp; - spin_lock_bh(&nf_conntrack_lock); for (; *bucket < net->ct.htable_size; (*bucket)++) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; + local_bh_disable(); + spin_lock(lockp); + if (*bucket < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } + } + spin_unlock(lockp); + local_bh_enable(); + } + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) - goto found; + set_bit(IPS_DYING_BIT, &ct->status); } + spin_unlock_bh(&pcpu->lock); } - hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - spin_unlock_bh(&nf_conntrack_lock); return NULL; found: atomic_inc(&ct->ct_general.use); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock(lockp); + local_bh_enable(); return ct; } @@ -1286,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; + int cpu; - spin_lock_bh(&nf_conntrack_lock); - hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - /* never fails to remove them, no listeners at this point */ - nf_ct_kill(ct); + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&pcpu->lock); } - spin_unlock_bh(&nf_conntrack_lock); } static int untrack_refs(void) @@ -1380,6 +1568,7 @@ i_see_dead_people: kmem_cache_destroy(net->ct.nf_conntrack_cachep); kfree(net->ct.slabname); free_percpu(net->ct.stat); + free_percpu(net->ct.pcpu_lists); } } @@ -1432,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; + local_bh_disable(); + nf_conntrack_all_lock(); + write_seqcount_begin(&init_net.ct.generation); + /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections * created because of a false negative won't make it into the hash - * though since that required taking the lock. + * though since that required taking the locks. */ - spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < init_net.ct.htable_size; i++) { while (!hlist_nulls_empty(&init_net.ct.hash[i])) { h = hlist_nulls_entry(init_net.ct.hash[i].first, @@ -1454,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; init_net.ct.hash = hash; - spin_unlock_bh(&nf_conntrack_lock); + + write_seqcount_end(&init_net.ct.generation); + nf_conntrack_all_unlock(); + local_bh_enable(); nf_ct_free_hashtable(old_hash, old_size); return 0; @@ -1476,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int ret, cpu; + int i, ret, cpu; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_init(&nf_conntrack_locks[i]); /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1592,37 +1791,44 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - int ret; + int ret = -ENOMEM; + int cpu; atomic_set(&net->ct.count, 0); - INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL); - net->ct.stat = alloc_percpu(struct ip_conntrack_stat); - if (!net->ct.stat) { - ret = -ENOMEM; + seqcount_init(&net->ct.generation); + + net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); + if (!net->ct.pcpu_lists) goto err_stat; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_init(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) + goto err_pcpu_lists; + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); - if (!net->ct.slabname) { - ret = -ENOMEM; + if (!net->ct.slabname) goto err_slabname; - } net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, sizeof(struct nf_conn), 0, SLAB_DESTROY_BY_RCU, NULL); if (!net->ct.nf_conntrack_cachep) { printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - ret = -ENOMEM; goto err_cache; } net->ct.htable_size = nf_conntrack_htable_size; net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); if (!net->ct.hash) { - ret = -ENOMEM; printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_hash; } @@ -1664,6 +1870,8 @@ err_cache: kfree(net->ct.slabname); err_slabname: free_percpu(net->ct.stat); +err_pcpu_lists: + free_percpu(net->ct.pcpu_lists); err_stat: return ret; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4fd1ca94fd4..f87e8f68ad4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -66,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) { struct nf_conntrack_expect *exp = (void *)ul_expect; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } @@ -155,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone, if (!nf_ct_is_confirmed(exp->master)) return NULL; + /* Avoid race with other CPUs, that for exp->master ct, is + * about to invoke ->destroy(), or nf_ct_delete() via timeout + * or early_drop(). + * + * The atomic_inc_not_zero() check tells: If that fails, we + * know that the ct is being destroyed. If it succeeds, we + * can be sure the ct cannot disappear underneath. + */ + if (unlikely(nf_ct_is_dying(exp->master) || + !atomic_inc_not_zero(&exp->master->ct_general.use))) + return NULL; + if (exp->flags & NF_CT_EXPECT_PERMANENT) { atomic_inc(&exp->use); return exp; @@ -162,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone, nf_ct_unlink_expect(exp); return exp; } + /* Undo exp->master refcnt increase, if del_timer() failed */ + nf_ct_put(exp->master); return NULL; } @@ -177,12 +191,14 @@ void nf_ct_remove_expectations(struct nf_conn *ct) if (!help) return; + spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); @@ -217,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); @@ -335,7 +351,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); helper = rcu_dereference_protected(master_help->helper, - lockdep_is_held(&nf_conntrack_lock)); + lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; @@ -395,7 +411,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } /* Will be over limit? */ helper = rcu_dereference_protected(master_help->helper, - lockdep_is_held(&nf_conntrack_lock)); + lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && @@ -417,12 +433,12 @@ out: return ret; } -int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report) { int ret; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect); if (ret <= 0) goto out; @@ -430,11 +446,11 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, ret = nf_ct_expect_insert(expect); if (ret < 0) goto out; - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; out: - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 70866d192ef..3a3a60b126e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1476,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_refresh(ct, skb, info->timeout * HZ); /* Set expect timeout */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, info->sig_port[!dir]); if (exp) { @@ -1486,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_dump_tuple(&exp->tuple); set_expect_timeout(exp, info->timeout); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 974a2a4adef..5b3eae7d4c9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -250,16 +250,14 @@ out: } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* appropiate ct lock protecting must be taken by caller */ static inline int unhelp(struct nf_conntrack_tuple_hash *i, const struct nf_conntrack_helper *me) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); - if (help && rcu_dereference_protected( - help->helper, - lockdep_is_held(&nf_conntrack_lock) - ) == me) { + if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } @@ -284,17 +282,17 @@ static LIST_HEAD(nf_ct_helper_expectfn_list); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); list_del_rcu(&n->head); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); @@ -396,15 +394,17 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, const struct hlist_node *next; const struct hlist_nulls_node *nn; unsigned int i; + int cpu; /* Get rid of expectations */ + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, - lockdep_is_held(&nf_conntrack_lock) + lockdep_is_held(&nf_conntrack_expect_lock) ) == me || exp->helper == me) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); @@ -412,14 +412,27 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } } } + spin_unlock_bh(&nf_conntrack_expect_lock); /* Get rid of expecteds, set helpers to NULL. */ - hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) - unhelp(h, me); - for (i = 0; i < net->ct.htable_size; i++) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) unhelp(h, me); + spin_unlock_bh(&pcpu->lock); + } + local_bh_disable(); + for (i = 0; i < net->ct.htable_size; i++) { + spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + if (i < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + unhelp(h, me); + } + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); } + local_bh_enable(); } void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) @@ -437,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) synchronize_rcu(); rtnl_lock(); - spin_lock_bh(&nf_conntrack_lock); for_each_net(net) __nf_conntrack_helper_unregister(me, net); - spin_unlock_bh(&nf_conntrack_lock); rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bb322d0beb4..300ed1eec72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -597,6 +597,9 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) ; @@ -764,14 +767,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; int res; + spinlock_t *lockp; + #ifdef CONFIG_NF_CONNTRACK_MARK const struct ctnetlink_dump_filter *filter = cb->data; #endif - spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; + + local_bh_disable(); for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: + lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (cb->args[0] >= net->ct.htable_size) { + spin_unlock(lockp); + goto out; + } hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -803,16 +815,18 @@ restart: if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; + spin_unlock(lockp); goto out; } } + spin_unlock(lockp); if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (last) nf_ct_put(last); @@ -966,7 +980,6 @@ ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, return 0; } -#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, @@ -984,9 +997,9 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_ZONE] = { .type = NLA_U16 }, [CTA_MARK_MASK] = { .type = NLA_U32 }, [CTA_LABELS] = { .type = NLA_BINARY, - .len = __CTA_LABELS_MAX_LENGTH }, + .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, - .len = __CTA_LABELS_MAX_LENGTH }, + .len = NF_CT_LABELS_MAX_SIZE }, }; static int @@ -1138,8 +1151,7 @@ static int ctnetlink_done_list(struct netlink_callback *cb) } static int -ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, - struct hlist_nulls_head *list) +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) { struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -1147,41 +1159,57 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; int res; + int cpu; + struct hlist_nulls_head *list; + struct net *net = sock_net(skb->sk); if (cb->args[2]) return 0; - spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; -restart: - hlist_nulls_for_each_entry(h, n, list, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (l3proto && nf_ct_l3num(ct) != l3proto) + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + struct ct_pcpu *pcpu; + + if (!cpu_possible(cpu)) continue; - if (cb->args[1]) { - if (ct != last) + + pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + spin_lock_bh(&pcpu->lock); + list = dying ? &pcpu->dying : &pcpu->unconfirmed; +restart: + hlist_nulls_for_each_entry(h, n, list, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (l3proto && nf_ct_l3num(ct) != l3proto) continue; - cb->args[1] = 0; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } + rcu_read_lock(); + res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { + if (!atomic_inc_not_zero(&ct->ct_general.use)) + continue; + cb->args[0] = cpu; + cb->args[1] = (unsigned long)ct; + spin_unlock_bh(&pcpu->lock); + goto out; + } } - rcu_read_lock(); - res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); - if (res < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; - goto out; + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; } + spin_unlock_bh(&pcpu->lock); } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; - } else - cb->args[2] = 1; + cb->args[2] = 1; out: - spin_unlock_bh(&nf_conntrack_lock); if (last) nf_ct_put(last); @@ -1191,9 +1219,7 @@ out: static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - - return ctnetlink_dump_list(skb, cb, &net->ct.dying); + return ctnetlink_dump_list(skb, cb, true); } static int @@ -1215,9 +1241,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, static int ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - - return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed); + return ctnetlink_dump_list(skb, cb, false); } static int @@ -1310,27 +1334,25 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) } static int -ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_NAT_NEEDED int ret; - if (cda[CTA_NAT_DST]) { - ret = ctnetlink_parse_nat_setup(ct, - NF_NAT_MANIP_DST, - cda[CTA_NAT_DST]); - if (ret < 0) - return ret; - } - if (cda[CTA_NAT_SRC]) { - ret = ctnetlink_parse_nat_setup(ct, - NF_NAT_MANIP_SRC, - cda[CTA_NAT_SRC]); - if (ret < 0) - return ret; - } - return 0; + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + return ret; #else + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; return -EOPNOTSUPP; #endif } @@ -1366,14 +1388,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) nf_ct_protonum(ct)); if (helper == NULL) { #ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); return -EOPNOTSUPP; } - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper) @@ -1659,11 +1681,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } - if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { - err = ctnetlink_change_nat(ct, cda); - if (err < 0) - goto err2; - } + err = ctnetlink_setup_nat(ct, cda); + if (err < 0) + goto err2; nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); @@ -1811,9 +1831,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -2023,6 +2043,9 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) ; } @@ -2142,9 +2165,9 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } @@ -2699,13 +2722,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } /* after list removal, usage count == 1 */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_expect_put(exp); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ nf_ct_expect_put(exp); @@ -2714,7 +2737,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conn_help *m_help; /* delete all expectations for this helper */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], @@ -2729,10 +2752,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } } } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } else { /* This basically means we have to flush everything*/ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], @@ -2745,7 +2768,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } } } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; @@ -2971,11 +2994,11 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = __nf_ct_expect_find(net, zone, &tuple); if (!exp) { - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { err = ctnetlink_create_expect(net, zone, cda, @@ -2989,7 +3012,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, err = -EEXIST; if (!(nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return err; } diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7bd03decd36..825c3e3f830 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -605,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = { .expect_policy = &pptp_exp_policy, }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ - nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { - .exit = nf_conntrack_pptp_net_exit, -}; - static int __init nf_conntrack_pptp_init(void) { - int rv; - - rv = nf_conntrack_helper_register(&pptp); - if (rv < 0) - return rv; - rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); - if (rv < 0) - nf_conntrack_helper_unregister(&pptp); - return rv; + return nf_conntrack_helper_register(&pptp); } static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9d9c0dade60..d5665739e3b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -66,7 +66,7 @@ static inline struct netns_proto_gre *gre_pernet(struct net *net) return net_generic(net, proto_gre_net_id); } -void nf_ct_gre_keymap_flush(struct net *net) +static void nf_ct_gre_keymap_flush(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km, *tmp; @@ -78,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net) } write_unlock_bh(&net_gre->keymap_lock); } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush); static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 466410eaa48..4c3ba1c8d68 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -800,7 +800,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct, struct hlist_node *next; int found = 0; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (exp->class != SIP_EXPECT_SIGNALLING || !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || @@ -815,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct, found = 1; break; } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return found; } @@ -825,7 +825,7 @@ static void flush_expectations(struct nf_conn *ct, bool media) struct nf_conntrack_expect *exp; struct hlist_node *next; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) continue; @@ -836,7 +836,7 @@ static void flush_expectations(struct nf_conn *ct, bool media) if (!media) break; } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index d3f5cd6dd96..a49907b1dab 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -358,6 +358,19 @@ out: rcu_read_unlock(); } +struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + if (nat) + return nat; + + if (!nf_ct_is_confirmed(ct)) + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + + return nat; +} +EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); + unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, @@ -368,14 +381,9 @@ nf_nat_setup_info(struct nf_conn *ct, struct nf_conn_nat *nat; /* nat helper or nfctnetlink also setup binding */ - nat = nfct_nat(ct); - if (!nat) { - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); @@ -432,15 +440,15 @@ nf_nat_setup_info(struct nf_conn *ct, } EXPORT_SYMBOL(nf_nat_setup_info); -unsigned int -nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +static unsigned int +__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = - (HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? + (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range range = { @@ -448,7 +456,13 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) .min_addr = ip, .max_addr = ip, }; - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); + return nf_nat_setup_info(ct, &range, manip); +} + +unsigned int +nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); @@ -511,6 +525,39 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static int nf_nat_proto_clean(struct nf_conn *ct, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + + if (nf_nat_proto_remove(ct, data)) + return 1; + + if (!nat || !nat->ct) + return 0; + + /* This netns is being destroyed, and conntrack has nat null binding. + * Remove it from bysource hash, as the table will be freed soon. + * + * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() + * will delete entry from already-freed table. + */ + if (!del_timer(&ct->timeout)) + return 1; + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + ct->status &= ~IPS_NAT_DONE_MASK; + nat->ct = NULL; + spin_unlock_bh(&nf_nat_lock); + + add_timer(&ct->timeout); + + /* don't delete conntrack. Although that would make things a lot + * simpler, we'd end up flushing all conntracks on nat rmmod. + */ + return 0; +} + static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) { struct nf_nat_proto_clean clean = { @@ -702,9 +749,9 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { static int nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range) + const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_nat_l3proto *l3proto) { - const struct nf_nat_l3proto *l3proto; struct nlattr *tb[CTA_NAT_MAX+1]; int err; @@ -714,38 +761,46 @@ nfnetlink_parse_nat(const struct nlattr *nat, if (err < 0) return err; - rcu_read_lock(); - l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); - if (l3proto == NULL) { - err = -EAGAIN; - goto out; - } err = l3proto->nlattr_to_range(tb, range); if (err < 0) - goto out; + return err; if (!tb[CTA_NAT_PROTO]) - goto out; + return 0; - err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); -out: - rcu_read_unlock(); - return err; + return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } +/* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range range; + const struct nf_nat_l3proto *l3proto; int err; - err = nfnetlink_parse_nat(attr, ct, &range); + /* Should not happen, restricted to creating new conntracks + * via ctnetlink. + */ + if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) + return -EEXIST; + + /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to + * attach the null binding, otherwise this may oops. + */ + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + if (l3proto == NULL) + return -EAGAIN; + + /* No NAT information has been passed, allocate the null-binding */ + if (attr == NULL) + return __nf_nat_alloc_null_binding(ct, manip); + + err = nfnetlink_parse_nat(attr, ct, &range, l3proto); if (err < 0) return err; - if (nf_nat_initialized(ct, manip)) - return -EEXIST; return nf_nat_setup_info(ct, &range, manip); } @@ -773,7 +828,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0); + nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9858e3e51a3..52e20c9a46a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -363,9 +363,8 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_tmpl_insert(net, ct); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -390,7 +389,7 @@ static void __net_exit synproxy_net_exit(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - nf_conntrack_free(snet->tmpl); + nf_ct_put(snet->tmpl); synproxy_proc_exit(net); free_percpu(snet->stats); } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 117bbaaddde..8746ff9a835 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -35,7 +35,7 @@ int nft_register_afinfo(struct net *net, struct nft_af_info *afi) { INIT_LIST_HEAD(&afi->tables); nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&afi->list, &net->nft.af_info); + list_add_tail_rcu(&afi->list, &net->nft.af_info); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(nft_register_afinfo); void nft_unregister_afinfo(struct nft_af_info *afi) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&afi->list); + list_del_rcu(&afi->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_afinfo); @@ -88,6 +88,45 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) return ERR_PTR(-EAFNOSUPPORT); } +static void nft_ctx_init(struct nft_ctx *ctx, + const struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nft_af_info *afi, + struct nft_table *table, + struct nft_chain *chain, + const struct nlattr * const *nla) +{ + ctx->net = sock_net(skb->sk); + ctx->afi = afi; + ctx->table = table; + ctx->chain = chain; + ctx->nla = nla; + ctx->portid = NETLINK_CB(skb).portid; + ctx->report = nlmsg_report(nlh); + ctx->seq = nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, + u32 size) +{ + struct nft_trans *trans; + + trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); + if (trans == NULL) + return NULL; + + trans->msg_type = msg_type; + trans->ctx = *ctx; + + return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + list_del(&trans->list); + kfree(trans); +} + /* * Tables */ @@ -152,8 +191,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi, #ifdef CONFIG_MODULES if (autoload) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%*.s", afi->family, - nla_len(nla)-1, (const char *)nla_data(nla)); + request_module("nft-chain-%u-%.*s", afi->family, + nla_len(nla), (const char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); type = __nf_tables_chain_type_lookup(afi->family, nla); if (type != NULL) @@ -197,20 +236,13 @@ nla_put_failure: return -1; } -static int nf_tables_table_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - int event, int family) +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - u32 seq = nlh ? nlh->nlmsg_seq : 0; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - bool report; int err; - report = nlh ? nlmsg_report(nlh) : false; - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -218,18 +250,20 @@ static int nf_tables_table_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, portid, seq, event, 0, - family, table); + err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -243,11 +277,14 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -260,15 +297,21 @@ static int nf_tables_dump_tables(struct sk_buff *skb, NLM_F_MULTI, afi->family, table) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: + rcu_read_unlock(); cb->args[0] = idx; return skb->len; } +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -295,6 +338,8 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -343,7 +388,7 @@ err: return err; } -static int nf_tables_table_disable(const struct nft_af_info *afi, +static void nf_tables_table_disable(const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; @@ -353,45 +398,66 @@ static int nf_tables_table_disable(const struct nft_af_info *afi, nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); } - - return 0; } -static int nf_tables_updtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct nft_af_info *afi, struct nft_table *table) +static int nf_tables_updtable(struct nft_ctx *ctx) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family, ret = 0; + struct nft_trans *trans; + u32 flags; + int ret = 0; - if (nla[NFTA_TABLE_FLAGS]) { - u32 flags; + if (!ctx->nla[NFTA_TABLE_FLAGS]) + return 0; - flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; - if ((flags & NFT_TABLE_F_DORMANT) && - !(table->flags & NFT_TABLE_F_DORMANT)) { - ret = nf_tables_table_disable(afi, table); - if (ret >= 0) - table->flags |= NFT_TABLE_F_DORMANT; - } else if (!(flags & NFT_TABLE_F_DORMANT) && - table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(afi, table); - if (ret >= 0) - table->flags &= ~NFT_TABLE_F_DORMANT; + if (flags == ctx->table->flags) + return 0; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, + sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if ((flags & NFT_TABLE_F_DORMANT) && + !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { + nft_trans_table_enable(trans) = false; + } else if (!(flags & NFT_TABLE_F_DORMANT) && + ctx->table->flags & NFT_TABLE_F_DORMANT) { + ret = nf_tables_table_enable(ctx->afi, ctx->table); + if (ret >= 0) { + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + nft_trans_table_enable(trans) = true; } - if (ret < 0) - goto err; } + if (ret < 0) + goto err; - nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family); + nft_trans_table_update(trans) = true; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; err: + nft_trans_destroy(trans); return ret; } +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -403,6 +469,8 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; + struct nft_ctx ctx; + int err; afi = nf_tables_afinfo_lookup(net, family, true); if (IS_ERR(afi)) @@ -417,11 +485,15 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, } if (table != NULL) { + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updtable(nlsk, skb, nlh, nla, afi, table); + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + return nf_tables_updtable(&ctx); } if (nla[NFTA_TABLE_FLAGS]) { @@ -444,8 +516,14 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&table->sets); table->flags = flags; - list_add_tail(&table->list, &afi->tables); - nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family); + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); + if (err < 0) { + kfree(table); + module_put(afi->owner); + return err; + } + list_add_tail_rcu(&table->list, &afi->tables); return 0; } @@ -457,7 +535,8 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, struct nft_af_info *afi; struct nft_table *table; struct net *net = sock_net(skb->sk); - int family = nfmsg->nfgen_family; + int family = nfmsg->nfgen_family, err; + struct nft_ctx ctx; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -466,17 +545,28 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); - - if (!list_empty(&table->chains) || !list_empty(&table->sets)) + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + if (table->use > 0) return -EBUSY; - list_del(&table->list); - nf_tables_table_notify(skb, nlh, table, NFT_MSG_DELTABLE, family); - kfree(table); - module_put(afi->owner); + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + + list_del_rcu(&table->list); return 0; } +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ + BUG_ON(ctx->table->use > 0); + + kfree(ctx->table); + module_put(ctx->afi->owner); +} + int nft_register_chain_type(const struct nf_chain_type *ctype) { int err = 0; @@ -541,7 +631,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_NUL_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, }; @@ -554,13 +644,20 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) { struct nft_stats *cpu_stats, total; struct nlattr *nest; + unsigned int seq; + u64 pkts, bytes; int cpu; memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); - total.pkts += cpu_stats->pkts; - total.bytes += cpu_stats->bytes; + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + pkts = cpu_stats->pkts; + bytes = cpu_stats->bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + total.pkts += pkts; + total.bytes += bytes; } nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); if (nest == NULL) @@ -637,21 +734,13 @@ nla_put_failure: return -1; } -static int nf_tables_chain_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - const struct nft_chain *chain, - int event, int family) +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - u32 seq = nlh ? nlh->nlmsg_seq : 0; - bool report; int err; - report = nlh ? nlmsg_report(nlh) : false; - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -659,18 +748,21 @@ static int nf_tables_chain_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, portid, seq, event, 0, family, - table, chain); + err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -685,12 +777,15 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -702,17 +797,19 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NLM_F_MULTI, afi->family, table, chain) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } } done: + rcu_read_unlock(); cb->args[0] = idx; return skb->len; } - static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -740,10 +837,14 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -767,8 +868,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; -static int -nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) +static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) { struct nlattr *tb[NFTA_COUNTER_MAX+1]; struct nft_stats __percpu *newstats; @@ -777,14 +877,14 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); if (err < 0) - return err; + return ERR_PTR(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return -EINVAL; + return ERR_PTR(-EINVAL); - newstats = alloc_percpu(struct nft_stats); + newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -793,27 +893,58 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, + struct nft_stats __percpu *newstats) +{ if (chain->stats) { - /* nfnl_lock is held, add some nfnl function for this, later */ struct nft_stats __percpu *oldstats = - rcu_dereference_protected(chain->stats, 1); + nft_dereference(chain->stats); rcu_assign_pointer(chain->stats, newstats); synchronize_rcu(); free_percpu(oldstats); } else rcu_assign_pointer(chain->stats, newstats); +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; } +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ + BUG_ON(chain->use > 0); + + if (chain->flags & NFT_BASE_CHAIN) { + module_put(nft_base_chain(chain)->type->owner); + free_percpu(nft_base_chain(chain)->stats); + kfree(nft_base_chain(chain)); + } else { + kfree(chain); + } +} + static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr * uninitialized_var(name); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct nft_base_chain *basechain = NULL; @@ -823,8 +954,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; + struct nft_stats __percpu *stats; int err; bool create; + struct nft_ctx ctx; create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; @@ -870,6 +1003,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, } if (chain != NULL) { + struct nft_stats *stats = NULL; + struct nft_trans *trans; + + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -883,19 +1021,31 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (!(chain->flags & NFT_BASE_CHAIN)) return -EOPNOTSUPP; - err = nf_tables_counters(nft_base_chain(chain), - nla[NFTA_CHAIN_COUNTERS]); - if (err < 0) - return err; + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) + return PTR_ERR(stats); } - if (nla[NFTA_CHAIN_POLICY]) - nft_base_chain(chain)->policy = policy; + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, + sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; - if (nla[NFTA_CHAIN_HANDLE] && name) - nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + nft_trans_chain_stats(trans) = stats; + nft_trans_chain_update(trans) = true; + + if (nla[NFTA_CHAIN_POLICY]) + nft_trans_chain_policy(trans) = policy; + else + nft_trans_chain_policy(trans) = -1; - goto notify; + if (nla[NFTA_CHAIN_HANDLE] && name) { + nla_strlcpy(nft_trans_chain_name(trans), name, + NFT_CHAIN_MAXNAMELEN); + } + list_add_tail(&trans->list, &net->nft.commit_list); + return 0; } if (table->use == UINT_MAX) @@ -940,23 +1090,21 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return -ENOMEM; if (nla[NFTA_CHAIN_COUNTERS]) { - err = nf_tables_counters(basechain, - nla[NFTA_CHAIN_COUNTERS]); - if (err < 0) { + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); - return err; + return PTR_ERR(stats); } + basechain->stats = stats; } else { - struct nft_stats __percpu *newstats; - - newstats = alloc_percpu(struct nft_stats); - if (newstats == NULL) { + stats = netdev_alloc_pcpu_stats(struct nft_stats); + if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); - return -ENOMEM; + return PTR_ERR(stats); } - rcu_assign_pointer(basechain->stats, newstats); + rcu_assign_pointer(basechain->stats, stats); } basechain->type = type; @@ -993,33 +1141,27 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (!(table->flags & NFT_TABLE_F_DORMANT) && chain->flags & NFT_BASE_CHAIN) { err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); - if (err < 0) { - module_put(basechain->type->owner); - free_percpu(basechain->stats); - kfree(basechain); - return err; - } + if (err < 0) + goto err1; } - list_add_tail(&chain->list, &table->chains); - table->use++; -notify: - nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN, - family); - return 0; -} - -static void nf_tables_rcu_chain_destroy(struct rcu_head *head) -{ - struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); - BUG_ON(chain->use > 0); + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); + if (err < 0) + goto err2; - if (chain->flags & NFT_BASE_CHAIN) { - module_put(nft_base_chain(chain)->type->owner); - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); - } else - kfree(chain); + table->use++; + list_add_tail_rcu(&chain->list, &table->chains); + return 0; +err2: + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } +err1: + nf_tables_chain_destroy(chain); + return err; } static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, @@ -1027,11 +1169,13 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nft_ctx ctx; + int err; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -1040,46 +1184,27 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) return PTR_ERR(chain); - - if (!list_empty(&chain->rules)) + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + if (chain->use > 0) return -EBUSY; - list_del(&chain->list); - table->use--; - - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); - - nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_DELCHAIN, - family); + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); + if (err < 0) + return err; - /* Make sure all rule references are gone before this is released */ - call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy); + table->use--; + list_del_rcu(&chain->list); return 0; } -static void nft_ctx_init(struct nft_ctx *ctx, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nft_af_info *afi, - const struct nft_table *table, - const struct nft_chain *chain, - const struct nlattr * const *nla) -{ - ctx->net = sock_net(skb->sk); - ctx->skb = skb; - ctx->nlh = nlh; - ctx->afi = afi; - ctx->table = table; - ctx->chain = chain; - ctx->nla = nla; -} - /* * Expressions */ @@ -1094,7 +1219,10 @@ static void nft_ctx_init(struct nft_ctx *ctx, int nft_register_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&type->list, &nf_tables_expressions); + if (type->family == NFPROTO_UNSPEC) + list_add_tail_rcu(&type->list, &nf_tables_expressions); + else + list_add_rcu(&type->list, &nf_tables_expressions); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1109,40 +1237,50 @@ EXPORT_SYMBOL_GPL(nft_register_expr); void nft_unregister_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&type->list); + list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_expr); -static const struct nft_expr_type *__nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *__nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name)) + if (!nla_strcmp(nla, type->name) && + (!type->family || type->family == family)) return type; } return NULL; } -static const struct nft_expr_type *nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; if (nla == NULL) return ERR_PTR(-EINVAL); - type = __nft_expr_type_get(nla); + type = __nft_expr_type_get(family, nla); if (type != NULL && try_module_get(type->owner)) return type; #ifdef CONFIG_MODULES if (type == NULL) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + + nfnl_unlock(NFNL_SUBSYS_NFTABLES); request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (__nft_expr_type_get(nla)) + if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); } #endif @@ -1193,7 +1331,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); @@ -1244,10 +1382,11 @@ err1: return err; } -static void nf_tables_expr_destroy(struct nft_expr *expr) +static void nf_tables_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) { if (expr->ops->destroy) - expr->ops->destroy(expr); + expr->ops->destroy(ctx, expr); module_put(expr->ops->type->owner); } @@ -1286,6 +1425,8 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, + [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, @@ -1338,6 +1479,10 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, } nla_nest_end(skb, list); + if (rule->ulen && + nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) + goto nla_put_failure; + return nlmsg_end(skb, nlh); nla_put_failure: @@ -1345,22 +1490,15 @@ nla_put_failure: return -1; } -static int nf_tables_rule_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - const struct nft_chain *chain, +static int nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, - int event, u32 flags, int family) + int event) { struct sk_buff *skb; - u32 portid = NETLINK_CB(oskb).portid; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - u32 seq = nlh->nlmsg_seq; - bool report; int err; - report = nlmsg_report(nlh); - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -1368,18 +1506,21 @@ static int nf_tables_rule_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, portid, seq, event, flags, - family, table, chain, rule); + err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain, rule); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -1429,16 +1570,17 @@ static int nf_tables_dump_rules(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - u8 genctr = ACCESS_ONCE(net->nft.genctr); - u8 gencursor = ACCESS_ONCE(net->nft.gencursor); - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { - list_for_each_entry(rule, &chain->rules, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_rule_is_active(net, rule)) goto cont; if (idx < s_idx) @@ -1452,6 +1594,8 @@ static int nf_tables_dump_rules(struct sk_buff *skb, NLM_F_MULTI | NLM_F_APPEND, afi->family, table, chain, rule) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } @@ -1459,9 +1603,7 @@ cont: } } done: - /* Invalidate this dump, a transition to the new generation happened */ - if (gencursor != net->nft.gencursor || genctr != net->nft.genctr) - return -EBUSY; + rcu_read_unlock(); cb->args[0] = idx; return skb->len; @@ -1495,10 +1637,14 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); if (IS_ERR(chain)) return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) @@ -1521,9 +1667,9 @@ err: return err; } -static void nf_tables_rcu_rule_destroy(struct rcu_head *head) +static void nf_tables_rule_destroy(const struct nft_ctx *ctx, + struct nft_rule *rule) { - struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head); struct nft_expr *expr; /* @@ -1532,55 +1678,46 @@ static void nf_tables_rcu_rule_destroy(struct rcu_head *head) */ expr = nft_expr_first(rule); while (expr->ops && expr != nft_expr_last(rule)) { - nf_tables_expr_destroy(expr); + nf_tables_expr_destroy(ctx, expr); expr = nft_expr_next(expr); } kfree(rule); } -static void nf_tables_rule_destroy(struct nft_rule *rule) +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) { - call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy); -} + struct nft_trans *trans; -#define NFT_RULE_MAXEXPRS 128 + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; -static struct nft_expr_info *info; + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); -static struct nft_rule_trans * -nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx) -{ - struct nft_rule_trans *rupd; - - rupd = kmalloc(sizeof(struct nft_rule_trans), GFP_KERNEL); - if (rupd == NULL) - return NULL; + return trans; +} - rupd->chain = ctx->chain; - rupd->table = ctx->table; - rupd->rule = rule; - rupd->family = ctx->afi->family; - rupd->nlh = ctx->nlh; - list_add_tail(&rupd->list, &ctx->net->nft.commit_list); +#define NFT_RULE_MAXEXPRS 128 - return rupd; -} +static struct nft_expr_info *info; static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; - struct nft_rule_trans *repl = NULL; + struct nft_trans *trans = NULL; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n; + unsigned int size, i, n, ulen = 0; int err, rem; bool create; u64 handle, pos_handle; @@ -1615,6 +1752,9 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); + + if (chain->use == UINT_MAX) + return -EOVERFLOW; } if (nla[NFTA_RULE_POSITION]) { @@ -1646,8 +1786,11 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, } } + if (nla[NFTA_RULE_USERDATA]) + ulen = nla_len(nla[NFTA_RULE_USERDATA]); + err = -ENOMEM; - rule = kzalloc(sizeof(*rule) + size, GFP_KERNEL); + rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); if (rule == NULL) goto err1; @@ -1655,6 +1798,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, rule->handle = handle; rule->dlen = size; + rule->ulen = ulen; + + if (ulen) + nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); expr = nft_expr_first(rule); for (i = 0; i < n; i++) { @@ -1667,13 +1814,15 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_rule_is_active_next(net, old_rule)) { - repl = nf_tables_trans_add(old_rule, &ctx); - if (repl == NULL) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, + old_rule); + if (trans == NULL) { err = -ENOMEM; goto err2; } nft_rule_disactivate_next(net, old_rule); - list_add_tail(&rule->list, &old_rule->list); + chain->use--; + list_add_tail_rcu(&rule->list, &old_rule->list); } else { err = -ENOENT; goto err2; @@ -1690,22 +1839,23 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, list_add_rcu(&rule->list, &chain->rules); } - if (nf_tables_trans_add(rule, &ctx) == NULL) { + if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { err = -ENOMEM; goto err3; } + chain->use++; return 0; err3: list_del_rcu(&rule->list); - if (repl) { - list_del_rcu(&repl->rule->list); - list_del(&repl->list); - nft_rule_clear(net, repl->rule); - kfree(repl); + if (trans) { + list_del_rcu(&nft_trans_rule(trans)->list); + nft_rule_clear(net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + chain->use++; } err2: - nf_tables_rule_destroy(rule); + nf_tables_rule_destroy(&ctx, rule); err1: for (i = 0; i < n; i++) { if (info[i].ops != NULL) @@ -1719,9 +1869,10 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ if (nft_rule_is_active_next(ctx->net, rule)) { - if (nf_tables_trans_add(rule, ctx) == NULL) + if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) return -ENOMEM; nft_rule_disactivate_next(ctx->net, rule); + ctx->chain->use--; return 0; } return -ENOENT; @@ -1745,9 +1896,9 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); - const struct nft_table *table; + struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; int family = nfmsg->nfgen_family, err = 0; @@ -1760,6 +1911,8 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; if (nla[NFTA_RULE_CHAIN]) { chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); @@ -1792,75 +1945,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return err; } -static int nf_tables_commit(struct sk_buff *skb) -{ - struct net *net = sock_net(skb->sk); - struct nft_rule_trans *rupd, *tmp; - - /* Bump generation counter, invalidate any dump in progress */ - net->nft.genctr++; - - /* A new generation has just started */ - net->nft.gencursor = gencursor_next(net); - - /* Make sure all packets have left the previous generation before - * purging old rules. - */ - synchronize_rcu(); - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete this rule from the dirty list */ - list_del(&rupd->list); - - /* This rule was inactive in the past and just became active. - * Clear the next bit of the genmask since its meaning has - * changed, now it is the future. - */ - if (nft_rule_is_active(net, rupd->rule)) { - nft_rule_clear(net, rupd->rule); - nf_tables_rule_notify(skb, rupd->nlh, rupd->table, - rupd->chain, rupd->rule, - NFT_MSG_NEWRULE, 0, - rupd->family); - kfree(rupd); - continue; - } - - /* This rule is in the past, get rid of it */ - list_del_rcu(&rupd->rule->list); - nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain, - rupd->rule, NFT_MSG_DELRULE, 0, - rupd->family); - nf_tables_rule_destroy(rupd->rule); - kfree(rupd); - } - - return 0; -} - -static int nf_tables_abort(struct sk_buff *skb) -{ - struct net *net = sock_net(skb->sk); - struct nft_rule_trans *rupd, *tmp; - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete all rules from the dirty list */ - list_del(&rupd->list); - - if (!nft_rule_is_active_next(net, rupd->rule)) { - nft_rule_clear(net, rupd->rule); - kfree(rupd); - continue; - } - - /* This rule is inactive, get rid of it */ - list_del_rcu(&rupd->rule->list); - nf_tables_rule_destroy(rupd->rule); - kfree(rupd); - } - return 0; -} - /* * Sets */ @@ -1870,7 +1954,7 @@ static LIST_HEAD(nf_tables_set_ops); int nft_register_set(struct nft_set_ops *ops) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&ops->list, &nf_tables_set_ops); + list_add_tail_rcu(&ops->list, &nf_tables_set_ops); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1879,14 +1963,23 @@ EXPORT_SYMBOL_GPL(nft_register_set); void nft_unregister_set(struct nft_set_ops *ops) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&ops->list); + list_del_rcu(&ops->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_set); -static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const nla[]) +/* + * Select a set implementation based on the data characteristics and the + * given policy. The total memory use might not be known if no size is + * given, in that case the amount of memory per element is used. + */ +static const struct nft_set_ops * +nft_select_set_ops(const struct nlattr * const nla[], + const struct nft_set_desc *desc, + enum nft_set_policies policy) { - const struct nft_set_ops *ops; + const struct nft_set_ops *ops, *bops; + struct nft_set_estimate est, best; u32 features; #ifdef CONFIG_MODULES @@ -1904,26 +1997,64 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const features &= NFT_SET_INTERVAL | NFT_SET_MAP; } - // FIXME: implement selection properly + bops = NULL; + best.size = ~0; + best.class = ~0; + list_for_each_entry(ops, &nf_tables_set_ops, list) { if ((ops->features & features) != features) continue; + if (!ops->estimate(desc, features, &est)) + continue; + + switch (policy) { + case NFT_SET_POL_PERFORMANCE: + if (est.class < best.class) + break; + if (est.class == best.class && est.size < best.size) + break; + continue; + case NFT_SET_POL_MEMORY: + if (est.size < best.size) + break; + if (est.size == best.size && est.class < best.class) + break; + continue; + default: + break; + } + if (!try_module_get(ops->owner)) continue; - return ops; + if (bops != NULL) + module_put(bops->owner); + + bops = ops; + best = est; } + if (bops != NULL) + return bops; + return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, - [NFTA_SET_NAME] = { .type = NLA_STRING }, + [NFTA_SET_NAME] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 }, [NFTA_SET_DATA_LEN] = { .type = NLA_U32 }, + [NFTA_SET_POLICY] = { .type = NLA_U32 }, + [NFTA_SET_DESC] = { .type = NLA_NESTED }, + [NFTA_SET_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { + [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, @@ -1933,8 +2064,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, { struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi = NULL; - const struct nft_table *table = NULL; + struct nft_af_info *afi = NULL; + struct nft_table *table = NULL; if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); @@ -1943,9 +2074,14 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, } if (nla[NFTA_SET_TABLE] != NULL) { + if (afi == NULL) + return -EAFNOSUPPORT; + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; } nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); @@ -1967,13 +2103,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_set *nf_tables_set_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + struct nft_trans *trans; + u32 id = ntohl(nla_get_be32(nla)); + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET && + id == nft_trans_set_id(trans)) + return nft_trans_set(trans); + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) { const struct nft_set *i; const char *p; unsigned long *inuse; - unsigned int n = 0; + unsigned int n = 0, min = 0; p = strnchr(name, IFNAMSIZ, '%'); if (p != NULL) { @@ -1983,23 +2133,28 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; - +cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; if (!sscanf(i->name, name, &tmp)) continue; - if (tmp < 0 || tmp > BITS_PER_LONG * PAGE_SIZE) + if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) continue; - set_bit(tmp, inuse); + set_bit(tmp - min, inuse); } - n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE); + n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); + if (n >= BITS_PER_BYTE * PAGE_SIZE) { + min += BITS_PER_BYTE * PAGE_SIZE; + memset(inuse, 0, PAGE_SIZE); + goto cont; + } free_page((unsigned long)inuse); } - snprintf(set->name, sizeof(set->name), name, n); + snprintf(set->name, sizeof(set->name), name, min + n); list_for_each_entry(i, &ctx->table->sets, list) { if (!strcmp(set->name, i->name)) return -ENFILE; @@ -2012,8 +2167,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, { struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - u32 portid = NETLINK_CB(ctx->skb).portid; - u32 seq = ctx->nlh->nlmsg_seq; + struct nlattr *desc; + u32 portid = ctx->portid; + u32 seq = ctx->seq; event |= NFNL_SUBSYS_NFTABLES << 8; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), @@ -2045,6 +2201,14 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + desc = nla_nest_start(skb, NFTA_SET_DESC); + if (desc == NULL) + goto nla_put_failure; + if (set->size && + nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + goto nla_put_failure; + nla_nest_end(skb, desc); + return nlmsg_end(skb, nlh); nla_put_failure: @@ -2054,19 +2218,18 @@ nla_put_failure: static int nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, - int event) + int event, gfp_t gfp_flags) { struct sk_buff *skb; - u32 portid = NETLINK_CB(ctx->skb).portid; - bool report; + u32 portid = ctx->portid; int err; - report = nlmsg_report(ctx->nlh); - if (!report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; - skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; @@ -2076,8 +2239,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, + ctx->report, gfp_flags); err: if (err < 0) nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); @@ -2093,7 +2256,10 @@ static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(set, &ctx->table->sets, list) { + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, @@ -2101,11 +2267,13 @@ static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[0] = idx; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } @@ -2119,7 +2287,10 @@ static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(table, &ctx->afi->tables, list) { + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(table, &ctx->afi->tables, list) { if (cur_table) { if (cur_table != table) continue; @@ -2128,7 +2299,7 @@ static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, } ctx->table = table; idx = 0; - list_for_each_entry(set, &ctx->table->sets, list) { + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, @@ -2137,12 +2308,14 @@ static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[2] = (unsigned long) table; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } @@ -2151,7 +2324,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, { const struct nft_set *set; unsigned int idx, s_idx = cb->args[0]; - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); int cur_family = cb->args[3]; @@ -2159,7 +2332,10 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (cur_family) { if (afi->family != cur_family) continue; @@ -2167,7 +2343,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, cur_family = 0; } - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { if (cur_table) { if (cur_table != table) continue; @@ -2178,7 +2354,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, ctx->table = table; ctx->afi = afi; idx = 0; - list_for_each_entry(set, &ctx->table->sets, list) { + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, @@ -2189,6 +2365,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[3] = afi->family; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } @@ -2198,6 +2375,7 @@ cont: } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } @@ -2228,6 +2406,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) return ret; } +#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ + static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2257,6 +2437,8 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) @@ -2273,13 +2455,50 @@ err: return err; } +static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, + struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *da[NFTA_SET_DESC_MAX + 1]; + int err; + + err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy); + if (err < 0) + return err; + + if (da[NFTA_SET_DESC_SIZE] != NULL) + desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + + return 0; +} + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + nft_trans_set_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); + set->flags |= NFT_SET_INACTIVE; + } + nft_trans_set(trans) = set; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; @@ -2287,14 +2506,18 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, char name[IFNAMSIZ]; unsigned int size; bool create; - u32 ktype, klen, dlen, dtype, flags; + u32 ktype, dtype, flags, policy; + struct nft_set_desc desc; int err; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || - nla[NFTA_SET_KEY_LEN] == NULL) + nla[NFTA_SET_KEY_LEN] == NULL || + nla[NFTA_SET_ID] == NULL) return -EINVAL; + memset(&desc, 0, sizeof(desc)); + ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); @@ -2302,8 +2525,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } - klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); - if (klen == 0 || klen > FIELD_SIZEOF(struct nft_data, data)) + desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); + if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) return -EINVAL; flags = 0; @@ -2315,7 +2538,6 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } dtype = 0; - dlen = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; @@ -2328,15 +2550,25 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; - dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); - if (dlen == 0 || - dlen > FIELD_SIZEOF(struct nft_data, data)) + desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); + if (desc.dlen == 0 || + desc.dlen > FIELD_SIZEOF(struct nft_data, data)) return -EINVAL; } else - dlen = sizeof(struct nft_data); + desc.dlen = sizeof(struct nft_data); } else if (flags & NFT_SET_MAP) return -EINVAL; + policy = NFT_SET_POL_PERFORMANCE; + if (nla[NFTA_SET_POLICY] != NULL) + policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + + if (nla[NFTA_SET_DESC] != NULL) { + err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); + if (err < 0) + return err; + } + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); @@ -2367,7 +2599,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(nla); + ops = nft_select_set_ops(nla, &desc, policy); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -2388,17 +2620,22 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&set->bindings); set->ops = ops; set->ktype = ktype; - set->klen = klen; + set->klen = desc.klen; set->dtype = dtype; - set->dlen = dlen; + set->dlen = desc.dlen; set->flags = flags; + set->size = desc.size; + + err = ops->init(set, &desc, nla); + if (err < 0) + goto err2; - err = ops->init(set, nla); + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) goto err2; - list_add_tail(&set->list, &table->sets); - nf_tables_set_notify(&ctx, set, NFT_MSG_NEWSET); + list_add_tail_rcu(&set->list, &table->sets); + table->use++; return 0; err2: @@ -2408,17 +2645,20 @@ err1: return err; } -static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +static void nft_set_destroy(struct nft_set *set) { - list_del(&set->list); - if (!(set->flags & NFT_SET_ANONYMOUS)) - nf_tables_set_notify(ctx, set, NFT_MSG_DELSET); - set->ops->destroy(set); module_put(set->ops->owner); kfree(set); } +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ + list_del_rcu(&set->list); + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + nft_set_destroy(set); +} + static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2428,6 +2668,8 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int err; + if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + return -EAFNOSUPPORT; if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; @@ -2435,16 +2677,20 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (err < 0) return err; - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) - return -EAFNOSUPPORT; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; if (!list_empty(&set->bindings)) return -EBUSY; - nf_tables_set_destroy(&ctx, set); + err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del_rcu(&set->list); + ctx.table->use--; return 0; } @@ -2495,16 +2741,17 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } bind: binding->chain = ctx->chain; - list_add_tail(&binding->list, &set->bindings); + list_add_tail_rcu(&binding->list, &set->bindings); return 0; } void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { - list_del(&binding->list); + list_del_rcu(&binding->list); - if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) + if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && + !(set->flags & NFT_SET_INACTIVE)) nf_tables_set_destroy(ctx, set); } @@ -2522,16 +2769,18 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + bool trans) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; - const struct nft_table *table; + struct nft_af_info *afi; + struct nft_table *table; struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); @@ -2541,6 +2790,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (!trans && (table->flags & NFT_TABLE_INACTIVE)) + return -ENOENT; nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); return 0; @@ -2614,13 +2865,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla); + err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, + false); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; event = NFT_MSG_NEWSETELEM; event |= NFNL_SUBSYS_NFTABLES << 8; @@ -2633,7 +2887,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_UNSPEC; + nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; @@ -2677,13 +2931,15 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -2694,7 +2950,98 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, return -EOPNOTSUPP; } -static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, +static int nf_tables_fill_setelem_info(struct sk_buff *skb, + const struct nft_ctx *ctx, u32 seq, + u32 portid, int event, u16 flags, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *nest; + int err; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + err = nf_tables_fill_setelem(skb, set, elem); + if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) +{ + struct net *net = ctx->net; + u32 portid = ctx->portid; + struct sk_buff *skb; + int err; + + if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, + set, elem); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); +err: + if (err < 0) + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, + int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + if (trans == NULL) + return NULL; + + nft_trans_elem_set(trans) = set; + return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; @@ -2702,8 +3049,12 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct nft_set_binding *binding; enum nft_registers dreg; + struct nft_trans *trans; int err; + if (set->size && set->nelems == set->size) + return -ENFILE; + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy); if (err < 0) @@ -2723,6 +3074,9 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_DATA] == NULL && !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; + if (nla[NFTA_SET_ELEM_DATA] != NULL && + elem.flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; @@ -2753,7 +3107,7 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, struct nft_ctx bind_ctx = { .afi = ctx->afi, .table = ctx->table, - .chain = binding->chain, + .chain = (struct nft_chain *)binding->chain, }; err = nft_validate_data_load(&bind_ctx, dreg, @@ -2763,12 +3117,20 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, } } + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); + if (trans == NULL) + goto err3; + err = set->ops->insert(set, &elem); if (err < 0) - goto err3; + goto err4; + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; +err4: + kfree(trans); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_uninit(&elem.data, d2.type); @@ -2782,35 +3144,46 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err; + int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); - if (IS_ERR(set)) - return PTR_ERR(set); + if (IS_ERR(set)) { + if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { + set = nf_tables_set_lookup_byid(net, + nla[NFTA_SET_ELEM_LIST_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr); if (err < 0) - return err; + break; + + set->nelems++; } - return 0; + return err; } -static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set, +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_trans *trans; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -2834,7 +3207,12 @@ static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err2; - set->ops->remove(set, &elem); + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); + if (trans == NULL) + goto err2; + + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); nft_data_uninit(&elem.key, NFT_DATA_VALUE); if (set->flags & NFT_SET_MAP) @@ -2853,9 +3231,9 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err; + int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; @@ -2868,14 +3246,16 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); if (err < 0) - return err; + break; + + set->nelems--; } - return 0; + return err; } static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { - .call = nf_tables_newtable, + .call_batch = nf_tables_newtable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, @@ -2885,12 +3265,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { - .call = nf_tables_deltable, + .call_batch = nf_tables_deltable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { - .call = nf_tables_newchain, + .call_batch = nf_tables_newchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -2900,7 +3280,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { - .call = nf_tables_delchain, + .call_batch = nf_tables_delchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -2920,7 +3300,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { - .call = nf_tables_newset, + .call_batch = nf_tables_newset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, @@ -2930,12 +3310,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { - .call = nf_tables_delset, + .call_batch = nf_tables_delset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { - .call = nf_tables_newsetelem, + .call_batch = nf_tables_newsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, @@ -2945,12 +3325,282 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { - .call = nf_tables_delsetelem, + .call_batch = nf_tables_delsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, }; +static void nft_chain_commit_update(struct nft_trans *trans) +{ + struct nft_base_chain *basechain; + + if (nft_trans_chain_name(trans)[0]) + strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); + + if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN)) + return; + + basechain = nft_base_chain(trans->ctx.chain); + nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans)); + + switch (nft_trans_chain_policy(trans)) { + case NF_DROP: + case NF_ACCEPT: + basechain->policy = nft_trans_chain_policy(trans); + break; + } +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * removed rules. + */ +static void nf_tables_commit_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_DELTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_DELRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_DELSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_commit(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + /* Bump generation counter, invalidate any dump in progress */ + while (++net->nft.base_seq == 0); + + /* A new generation has just started */ + net->nft.gencursor = gencursor_next(net); + + /* Make sure all packets have left the previous generation before + * purging old rules. + */ + synchronize_rcu(); + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (!nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + } else { + trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + } + nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELTABLE: + nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) + nft_chain_commit_update(trans); + else + trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + + nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + break; + case NFT_MSG_NEWRULE: + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_NEWRULE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELRULE: + list_del_rcu(&nft_trans_rule(trans)->list); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_DELRULE); + break; + case NFT_MSG_NEWSET: + nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS && + !list_empty(&nft_trans_set(trans)->bindings)) + trans->ctx.table->use--; + + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_NEWSET, GFP_KERNEL); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSET: + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_DELSET, GFP_KERNEL); + break; + case NFT_MSG_NEWSETELEM: + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_NEWSETELEM, 0); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_DELSETELEM, 0); + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); + } + + return 0; +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * aborted rules. + */ +static void nf_tables_abort_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_NEWCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_NEWRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_NEWSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_abort(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + nft_trans_destroy(trans); + } else { + list_del_rcu(&trans->ctx.table->list); + } + break; + case NFT_MSG_DELTABLE: + list_add_tail_rcu(&trans->ctx.table->list, + &trans->ctx.afi->tables); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) { + if (nft_trans_chain_stats(trans)) + free_percpu(nft_trans_chain_stats(trans)); + + nft_trans_destroy(trans); + } else { + trans->ctx.table->use--; + list_del_rcu(&trans->ctx.chain->list); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + } + break; + case NFT_MSG_DELCHAIN: + trans->ctx.table->use++; + list_add_tail_rcu(&trans->ctx.chain->list, + &trans->ctx.table->chains); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWRULE: + trans->ctx.chain->use--; + list_del_rcu(&nft_trans_rule(trans)->list); + break; + case NFT_MSG_DELRULE: + trans->ctx.chain->use++; + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSET: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_set(trans)->list); + break; + case NFT_MSG_DELSET: + trans->ctx.table->use++; + list_add_tail_rcu(&nft_trans_set(trans)->list, + &trans->ctx.table->sets); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSETELEM: + nft_trans_elem_set(trans)->nelems--; + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nft_trans_elem_set(trans)->nelems++; + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe_reverse(trans, next, + &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); + } + + return 0; +} + static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, @@ -2977,6 +3627,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, const struct nft_set_elem *elem) { + if (elem->flags & NFT_SET_ELEM_INTERVAL_END) + return 0; + switch (elem->data.verdict) { case NFT_JUMP: case NFT_GOTO: @@ -3151,9 +3804,16 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict) { - case NF_ACCEPT: - case NF_DROP: - case NF_QUEUE: + default: + switch (data->verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; + default: + return -EINVAL; + } + /* fall through */ case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: @@ -3174,8 +3834,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->chain = chain; desc->len = sizeof(data); break; - default: - return -EINVAL; } desc->type = NFT_DATA_VERDICT; @@ -3330,6 +3988,7 @@ static int nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.af_info); INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; return 0; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 0d879fcb876..3b90eb2b2c5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,8 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1]) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - u32 mask; + u32 mask = nft_cmp_fast_mask(priv->len); - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - priv->len); if ((data[priv->sreg].data[0] & mask) == priv->data) return; data[NFT_REG_VERDICT].verdict = NFT_BREAK; @@ -67,20 +66,6 @@ struct nft_jumpstack { int rulenum; }; -static inline void -nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt, - struct nft_jumpstack *jumpstack, unsigned int stackptr) -{ - struct nft_stats __percpu *stats; - const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this; - - rcu_read_lock_bh(); - stats = rcu_dereference(nft_base_chain(chain)->stats); - __this_cpu_inc(stats->pkts); - __this_cpu_add(stats->bytes, pkt->skb->len); - rcu_read_unlock_bh(); -} - enum nft_trace { NFT_TRACE_RULE, NFT_TRACE_RETURN, @@ -103,9 +88,9 @@ static struct nf_loginfo trace_loginfo = { }, }; -static inline void nft_trace_packet(const struct nft_pktinfo *pkt, - const struct nft_chain *chain, - int rulenum, enum nft_trace type) +static void nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) { struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); @@ -118,13 +103,14 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt, unsigned int nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { - const struct nft_chain *chain = ops->priv; + const struct nft_chain *chain = ops->priv, *basechain = chain; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_data data[NFT_REG_MAX + 1]; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - int rulenum = 0; + struct nft_stats *stats; + int rulenum; /* * Cache cursor to avoid problems in case that the cursor is updated * while traversing the ruleset. @@ -132,6 +118,7 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); do_chain: + rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); next_rule: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; @@ -157,8 +144,10 @@ next_rule: switch (data[NFT_REG_VERDICT].verdict) { case NFT_BREAK: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; - /* fall through */ + continue; case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); continue; } break; @@ -184,37 +173,46 @@ next_rule: jumpstack[stackptr].rule = rule; jumpstack[stackptr].rulenum = rulenum; stackptr++; - /* fall through */ + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; case NFT_GOTO: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + chain = data[NFT_REG_VERDICT].chain; goto do_chain; case NFT_RETURN: if (unlikely(pkt->skb->nf_trace)) nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); - - /* fall through */ + break; case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) + nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); break; default: WARN_ON(1); } if (stackptr > 0) { - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); - stackptr--; chain = jumpstack[stackptr].chain; rule = jumpstack[stackptr].rule; rulenum = jumpstack[stackptr].rulenum; goto next_rule; } - nft_chain_stats(chain, pkt, jumpstack, stackptr); if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_POLICY); + nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + + rcu_read_lock_bh(); + stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); + rcu_read_unlock_bh(); - return nft_base_chain(chain)->policy; + return nft_base_chain(basechain)->policy; } EXPORT_SYMBOL_GPL(nft_do_chain); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 046aa13b4fe..c138b8fbe28 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -61,6 +61,14 @@ void nfnl_unlock(__u8 subsys_id) } EXPORT_SYMBOL_GPL(nfnl_unlock); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(u8 subsys_id) +{ + return lockdep_is_held(&table[subsys_id].mutex); +} +EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); +#endif + int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { nfnl_lock(n->subsys_id); @@ -248,15 +256,15 @@ replay: #endif { nfnl_unlock(subsys_id); - kfree_skb(nskb); - return netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(nskb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - kfree_skb(nskb); - return netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(skb); } while (skb->len >= nlmsg_total_size(0)) { @@ -360,14 +368,13 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - struct net *net = sock_net(skb->sk); int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { netlink_ack(skb, nlh, -EPERM); return; } @@ -392,19 +399,17 @@ static void nfnetlink_rcv(struct sk_buff *skb) } #ifdef CONFIG_MODULES -static void nfnetlink_bind(int group) +static int nfnetlink_bind(int group) { const struct nfnetlink_subsystem *ss; int type = nfnl_group2type[group]; rcu_read_lock(); ss = nfnetlink_get_subsys(type); - if (!ss) { - rcu_read_unlock(); - request_module("nfnetlink-subsys-%d", type); - return; - } rcu_read_unlock(); + if (!ss) + request_module("nfnetlink-subsys-%d", type); + return 0; } #endif diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c7b6d466a66..2baa125c2e8 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -32,18 +32,24 @@ static LIST_HEAD(nfnl_acct_list); struct nf_acct { atomic64_t pkts; atomic64_t bytes; + unsigned long flags; struct list_head head; atomic_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; + char data[0]; }; +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) + static int nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; char *acct_name; + unsigned int size = 0; + u32 flags = 0; if (!tb[NFACCT_NAME]) return -EINVAL; @@ -68,15 +74,38 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); + smp_mb__before_atomic(); + /* reset overquota flag if quota is enabled. */ + if ((matching->flags & NFACCT_F_QUOTA)) + clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); return 0; } return -EBUSY; } - nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); + if (tb[NFACCT_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); + if (flags & ~NFACCT_F_QUOTA) + return -EOPNOTSUPP; + if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) + return -EINVAL; + if (flags & NFACCT_F_OVERQUOTA) + return -EINVAL; + + size += sizeof(u64); + } + + nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); if (nfacct == NULL) return -ENOMEM; + if (flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)nfacct->data; + + *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); + nfacct->flags = flags; + } + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { @@ -117,6 +146,9 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); + smp_mb__before_atomic(); + if (acct->flags & NFACCT_F_QUOTA) + clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); @@ -125,7 +157,13 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) goto nla_put_failure; + if (acct->flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)acct->data; + if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return skb->len; @@ -270,6 +308,8 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, [NFACCT_BYTES] = { .type = NLA_U64 }, [NFACCT_PKTS] = { .type = NLA_U64 }, + [NFACCT_FLAGS] = { .type = NLA_U32 }, + [NFACCT_QUOTA] = { .type = NLA_U64 }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { @@ -336,6 +376,50 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_update); +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ + int ret; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + return; + + ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, + nfacct); + if (ret <= 0) { + kfree_skb(skb); + return; + } + netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, + GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + u64 now; + u64 *quota; + int ret = NFACCT_UNDERQUOTA; + + /* no place here if we don't have a quota */ + if (!(nfacct->flags & NFACCT_F_QUOTA)) + return NFACCT_NO_QUOTA; + + quota = (u64 *)nfacct->data; + now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? + atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + + ret = now > *quota; + + if (now >= *quota && + !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { + nfnl_overquota_report(nfacct); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); + static int __init nfnl_acct_init(void) { int ret; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index a155d19a225..d292c8d286e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -28,8 +28,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> -#include <linux/jhash.h> -#include <linux/random.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> @@ -75,7 +73,6 @@ struct nfulnl_instance { }; #define INSTANCE_BUCKETS 16 -static unsigned int hash_init; static int nfnl_log_net_id __read_mostly; @@ -1067,11 +1064,6 @@ static int __init nfnetlink_log_init(void) { int status = -ENOMEM; - /* it's not really all that important to have a random value, so - * we can do this from the init function, even if there hasn't - * been that much entropy yet */ - get_random_bytes(&hash_init, sizeof(hash_init)); - netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index f072fe80351..108120f216b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -354,13 +354,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, GFP_ATOMIC); - if (!skb) + if (!skb) { + skb_tx_error(entskb); return NULL; + } nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg), 0); if (!nlh) { + skb_tx_error(entskb); kfree_skb(skb); return NULL; } @@ -488,13 +491,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); - skb_zerocopy(skb, entskb, data_len, hlen); + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; } nlh->nlmsg_len = skb->len; return skb; nla_put_failure: + skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); return NULL; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 954925db414..e2b3f51c81f 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -128,7 +128,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, BUG_ON(err < 0); desc.len *= BITS_PER_BYTE; - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - desc.len); + mask = nft_cmp_fast_mask(desc.len); priv->data = data.data[0] & mask; priv->len = desc.len; return 0; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 82cb8236f8a..1840989092e 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -192,9 +192,18 @@ err: } static void -nft_target_destroy(const struct nft_expr *expr) +nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgdtor_param par; + + par.net = ctx->net; + par.target = target; + par.targinfo = info; + par.family = ctx->afi->family; + if (par.target->destroy != NULL) + par.target->destroy(&par); module_put(target->me); } @@ -379,9 +388,18 @@ err: } static void -nft_match_destroy(const struct nft_expr *expr) +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_mtdtor_param par; + + par.net = ctx->net; + par.match = match; + par.matchinfo = info; + par.family = ctx->afi->family; + if (par.match->destroy != NULL) + par.match->destroy(&par); module_put(match->me); } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 917052e2060..cc560301624 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -19,15 +19,15 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> struct nft_ct { enum nft_ct_keys key:8; enum ip_conntrack_dir dir:8; - union{ + union { enum nft_registers dreg:8; enum nft_registers sreg:8; }; - uint8_t family; }; static void nft_ct_get_eval(const struct nft_expr *expr, @@ -97,6 +97,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr, goto err; strncpy((char *)dest->data, helper->name, sizeof(dest->data)); return; +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: { + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int size; + + if (!labels) { + memset(dest->data, 0, sizeof(dest->data)); + return; + } + + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); + size = labels->words * sizeof(long); + + memcpy(dest->data, labels->bits, size); + if (size < sizeof(dest->data)) + memset(((char *) dest->data) + size, 0, + sizeof(dest->data) - size); + return; + } +#endif } tuple = &ct->tuplehash[priv->dir].tuple; @@ -195,22 +215,14 @@ static void nft_ct_l3proto_module_put(uint8_t family) nf_ct_l3proto_module_put(family); } -static int nft_ct_init_validate_get(const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_ct_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + int err; - if (tb[NFTA_CT_DIRECTION] != NULL) { - priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); - switch (priv->dir) { - case IP_CT_DIR_ORIGINAL: - case IP_CT_DIR_REPLY: - break; - default: - return -EINVAL; - } - } - + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { case NFT_CT_STATE: case NFT_CT_DIRECTION: @@ -221,11 +233,15 @@ static int nft_ct_init_validate_get(const struct nft_expr *expr, #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: +#endif case NFT_CT_EXPIRATION: case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; break; + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: @@ -238,69 +254,66 @@ static int nft_ct_init_validate_get(const struct nft_expr *expr, return -EOPNOTSUPP; } - return 0; -} - -static int nft_ct_init_validate_set(uint32_t key) -{ - switch (key) { - case NFT_CT_MARK: - break; - default: - return -EOPNOTSUPP; + if (tb[NFTA_CT_DIRECTION] != NULL) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } } + priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + return 0; } -static int nft_ct_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_ct_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); - - if (tb[NFTA_CT_DREG]) { - err = nft_ct_init_validate_get(expr, tb); - if (err < 0) - return err; - - priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - err = nft_validate_data_load(ctx, priv->dreg, NULL, - NFT_DATA_VALUE); - if (err < 0) - return err; - } else { - err = nft_ct_init_validate_set(priv->key); - if (err < 0) - return err; - - priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); - err = nft_validate_input_register(priv->sreg); - if (err < 0) - return err; + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + break; +#endif + default: + return -EOPNOTSUPP; } - err = nft_ct_l3proto_try_module_get(ctx->afi->family); + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); + err = nft_validate_input_register(priv->sreg); if (err < 0) return err; - priv->family = ctx->afi->family; + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; return 0; } -static void nft_ct_destroy(const struct nft_expr *expr) +static void nft_ct_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { - struct nft_ct *priv = nft_expr_priv(expr); - - nft_ct_l3proto_module_put(priv->family); + nft_ct_l3proto_module_put(ctx->afi->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -311,8 +324,19 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; - if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) - goto nla_put_failure; + + switch (priv->key) { + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + default: + break; + } + return 0; nla_put_failure: @@ -338,7 +362,7 @@ static const struct nft_expr_ops nft_ct_get_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, - .init = nft_ct_init, + .init = nft_ct_get_init, .destroy = nft_ct_destroy, .dump = nft_ct_get_dump, }; @@ -347,7 +371,7 @@ static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, - .init = nft_ct_init, + .init = nft_ct_set_init, .destroy = nft_ct_destroy, .dump = nft_ct_set_dump, }; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 3d3f8fce10a..4080ed6a072 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -12,23 +12,36 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/jhash.h> #include <linux/netlink.h> +#include <linux/vmalloc.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#define NFT_HASH_MIN_SIZE 4UL + struct nft_hash { - struct hlist_head *hash; - unsigned int hsize; + struct nft_hash_table __rcu *tbl; +}; + +struct nft_hash_table { + unsigned int size; + struct nft_hash_elem __rcu *buckets[]; }; struct nft_hash_elem { - struct hlist_node hnode; - struct nft_data key; - struct nft_data data[]; + struct nft_hash_elem __rcu *next; + struct nft_data key; + struct nft_data data[]; }; +#define nft_hash_for_each_entry(i, head) \ + for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) +#define nft_hash_for_each_entry_rcu(i, head) \ + for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) + static u32 nft_hash_rnd __read_mostly; static bool nft_hash_rnd_initted __read_mostly; @@ -38,7 +51,7 @@ static unsigned int nft_hash_data(const struct nft_data *data, unsigned int h; h = jhash(data->data, len, nft_hash_rnd); - return ((u64)h * hsize) >> 32; + return h & (hsize - 1); } static bool nft_hash_lookup(const struct nft_set *set, @@ -46,11 +59,12 @@ static bool nft_hash_lookup(const struct nft_set *set, struct nft_data *data) { const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); const struct nft_hash_elem *he; unsigned int h; - h = nft_hash_data(key, priv->hsize, set->klen); - hlist_for_each_entry(he, &priv->hash[h], hnode) { + h = nft_hash_data(key, tbl->size, set->klen); + nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { if (nft_data_cmp(&he->key, key, set->klen)) continue; if (set->flags & NFT_SET_MAP) @@ -60,19 +74,148 @@ static bool nft_hash_lookup(const struct nft_set *set, return false; } -static void nft_hash_elem_destroy(const struct nft_set *set, - struct nft_hash_elem *he) +static void nft_hash_tbl_free(const struct nft_hash_table *tbl) { - nft_data_uninit(&he->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) - nft_data_uninit(he->data, set->dtype); - kfree(he); + kvfree(tbl); +} + +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ + return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + +static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) +{ + struct nft_hash_table *tbl; + size_t size; + + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); + if (tbl == NULL) + tbl = vzalloc(size); + if (tbl == NULL) + return NULL; + tbl->size = nbuckets; + + return tbl; +} + +static void nft_hash_chain_unzip(const struct nft_set *set, + const struct nft_hash_table *ntbl, + struct nft_hash_table *tbl, unsigned int n) +{ + struct nft_hash_elem *he, *last, *next; + unsigned int h; + + he = nft_dereference(tbl->buckets[n]); + if (he == NULL) + return; + h = nft_hash_data(&he->key, ntbl->size, set->klen); + + /* Find last element of first chain hashing to bucket h */ + last = he; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) + break; + last = he; + } + + /* Unlink first chain from the old table */ + RCU_INIT_POINTER(tbl->buckets[n], last->next); + + /* If end of chain reached, done */ + if (he == NULL) + return; + + /* Find first element of second chain hashing to bucket h */ + next = NULL; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) + continue; + next = he; + break; + } + + /* Link the two chains */ + RCU_INIT_POINTER(last->next, next); +} + +static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem *he; + unsigned int i, h; + bool complete; + + ntbl = nft_hash_tbl_alloc(tbl->size * 2); + if (ntbl == NULL) + return -ENOMEM; + + /* Link new table's buckets to first element in the old table + * hashing to the new bucket. + */ + for (i = 0; i < ntbl->size; i++) { + h = i < tbl->size ? i : i - tbl->size; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) + continue; + RCU_INIT_POINTER(ntbl->buckets[i], he); + break; + } + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + + /* Unzip interleaved hash chains */ + do { + /* Wait for readers to use new table/unzipped chains */ + synchronize_rcu(); + + complete = true; + for (i = 0; i < tbl->size; i++) { + nft_hash_chain_unzip(set, ntbl, tbl, i); + if (tbl->buckets[i] != NULL) + complete = false; + } + } while (!complete); + + nft_hash_tbl_free(tbl); + return 0; +} + +static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem __rcu **pprev; + unsigned int i; + + ntbl = nft_hash_tbl_alloc(tbl->size / 2); + if (ntbl == NULL) + return -ENOMEM; + + for (i = 0; i < ntbl->size; i++) { + ntbl->buckets[i] = tbl->buckets[i]; + + for (pprev = &ntbl->buckets[i]; *pprev != NULL; + pprev = &nft_dereference(*pprev)->next) + ; + RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + synchronize_rcu(); + + nft_hash_tbl_free(tbl); + return 0; } static int nft_hash_insert(const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); struct nft_hash_elem *he; unsigned int size, h; @@ -91,33 +234,64 @@ static int nft_hash_insert(const struct nft_set *set, if (set->flags & NFT_SET_MAP) nft_data_copy(he->data, &elem->data); - h = nft_hash_data(&he->key, priv->hsize, set->klen); - hlist_add_head_rcu(&he->hnode, &priv->hash[h]); + h = nft_hash_data(&he->key, tbl->size, set->klen); + RCU_INIT_POINTER(he->next, tbl->buckets[h]); + rcu_assign_pointer(tbl->buckets[h], he); + + /* Expand table when exceeding 75% load */ + if (set->nelems + 1 > tbl->size / 4 * 3) + nft_hash_tbl_expand(set, priv); + return 0; } +static void nft_hash_elem_destroy(const struct nft_set *set, + struct nft_hash_elem *he) +{ + nft_data_uninit(&he->key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP) + nft_data_uninit(he->data, set->dtype); + kfree(he); +} + static void nft_hash_remove(const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_hash_elem *he = elem->cookie; + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, __rcu **pprev; + + pprev = elem->cookie; + he = nft_dereference((*pprev)); - hlist_del_rcu(&he->hnode); + RCU_INIT_POINTER(*pprev, he->next); + synchronize_rcu(); kfree(he); + + /* Shrink table beneath 30% load */ + if (set->nelems - 1 < tbl->size * 3 / 10 && + tbl->size > NFT_HASH_MIN_SIZE) + nft_hash_tbl_shrink(set, priv); } static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem __rcu * const *pprev; struct nft_hash_elem *he; unsigned int h; - h = nft_hash_data(&elem->key, priv->hsize, set->klen); - hlist_for_each_entry(he, &priv->hash[h], hnode) { - if (nft_data_cmp(&he->key, &elem->key, set->klen)) + h = nft_hash_data(&elem->key, tbl->size, set->klen); + pprev = &tbl->buckets[h]; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_data_cmp(&he->key, &elem->key, set->klen)) { + pprev = &he->next; continue; + } - elem->cookie = he; - elem->flags = 0; + elem->cookie = (void *)pprev; + elem->flags = 0; if (set->flags & NFT_SET_MAP) nft_data_copy(&elem->data, he->data); return 0; @@ -129,12 +303,13 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_iter *iter) { const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); const struct nft_hash_elem *he; struct nft_set_elem elem; unsigned int i; - for (i = 0; i < priv->hsize; i++) { - hlist_for_each_entry(he, &priv->hash[i], hnode) { + for (i = 0; i < tbl->size; i++) { + nft_hash_for_each_entry(he, tbl->buckets[i]) { if (iter->count < iter->skip) goto cont; @@ -158,50 +333,77 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) } static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * const tb[]) { struct nft_hash *priv = nft_set_priv(set); - unsigned int cnt, i; + struct nft_hash_table *tbl; + unsigned int size; if (unlikely(!nft_hash_rnd_initted)) { get_random_bytes(&nft_hash_rnd, 4); nft_hash_rnd_initted = true; } - /* Aim for a load factor of 0.75 */ - // FIXME: temporarily broken until we have set descriptions - cnt = 100; - cnt = cnt * 4 / 3; + size = NFT_HASH_MIN_SIZE; + if (desc->size) + size = nft_hash_tbl_size(desc->size); - priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL); - if (priv->hash == NULL) + tbl = nft_hash_tbl_alloc(size); + if (tbl == NULL) return -ENOMEM; - priv->hsize = cnt; - - for (i = 0; i < cnt; i++) - INIT_HLIST_HEAD(&priv->hash[i]); - + RCU_INIT_POINTER(priv->tbl, tbl); return 0; } static void nft_hash_destroy(const struct nft_set *set) { const struct nft_hash *priv = nft_set_priv(set); - const struct hlist_node *next; - struct nft_hash_elem *elem; + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, *next; unsigned int i; - for (i = 0; i < priv->hsize; i++) { - hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) { - hlist_del(&elem->hnode); - nft_hash_elem_destroy(set, elem); + for (i = 0; i < tbl->size; i++) { + for (he = nft_dereference(tbl->buckets[i]); he != NULL; + he = next) { + next = nft_dereference(he->next); + nft_hash_elem_destroy(set, he); } } - kfree(priv->hash); + kfree(tbl); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int esize; + + esize = sizeof(struct nft_hash_elem); + if (features & NFT_SET_MAP) + esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + + if (desc->size) { + est->size = sizeof(struct nft_hash) + + nft_hash_tbl_size(desc->size) * + sizeof(struct nft_hash_elem *) + + desc->size * esize; + } else { + /* Resizing happens when the load drops below 30% or goes + * above 75%. The average of 52.5% load (approximated by 50%) + * is used for the size estimation of the hash buckets, + * meaning we calculate two buckets per element. + */ + est->size = esize + 2 * sizeof(struct nft_hash_elem *); + } + + est->class = NFT_SET_CLASS_O_1; + + return true; } static struct nft_set_ops nft_hash_ops __read_mostly = { .privsize = nft_hash_privsize, + .estimate = nft_hash_estimate, .init = nft_hash_init, .destroy = nft_hash_destroy, .get = nft_hash_get, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index f169501f1ad..810385eb724 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -70,7 +70,8 @@ err1: return err; } -static void nft_immediate_destroy(const struct nft_expr *expr) +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 5af790123ad..10cfb156cdf 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -23,7 +23,6 @@ static const char *nft_log_null_prefix = ""; struct nft_log { struct nf_loginfo loginfo; char *prefix; - int family; }; static void nft_log_eval(const struct nft_expr *expr, @@ -33,7 +32,7 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_log *priv = nft_expr_priv(expr); struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, priv->family, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, pkt->out, &priv->loginfo, "%s", priv->prefix); } @@ -52,8 +51,6 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; - priv->family = ctx->afi->family; - nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); @@ -77,7 +74,8 @@ static int nft_log_init(const struct nft_ctx *ctx, return 0; } -static void nft_log_destroy(const struct nft_expr *expr) +static void nft_log_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_log *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 8a6116b75b5..6404a726d17 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -16,6 +16,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_lookup { struct nft_set *set; @@ -55,8 +56,14 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); - if (IS_ERR(set)) - return PTR_ERR(set); + if (IS_ERR(set)) { + if (tb[NFTA_LOOKUP_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_LOOKUP_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); err = nft_validate_input_register(priv->sreg); @@ -88,11 +95,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return 0; } -static void nft_lookup_destroy(const struct nft_expr *expr) +static void nft_lookup_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); - nf_tables_unbind_set(NULL, priv->set, &priv->binding); + nf_tables_unbind_set(ctx, priv->set, &priv->binding); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e8254ad2e5a..852b178c6ae 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -18,18 +18,11 @@ #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -static void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; @@ -116,7 +109,7 @@ static void nft_meta_get_eval(const struct nft_expr *expr, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); break; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: { const struct dst_entry *dst = skb_dst(skb); @@ -140,10 +133,11 @@ static void nft_meta_get_eval(const struct nft_expr *expr, err: data[NFT_REG_VERDICT].verdict = NFT_BREAK; } +EXPORT_SYMBOL_GPL(nft_meta_get_eval); -static void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -163,28 +157,24 @@ static void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } +EXPORT_SYMBOL_GPL(nft_meta_set_eval); -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(nft_meta_policy); -static int nft_meta_init_validate_set(uint32_t key) +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { - switch (key) { - case NFT_META_MARK: - case NFT_META_PRIORITY: - case NFT_META_NFTRACE: - return 0; - default: - return -EOPNOTSUPP; - } -} + struct nft_meta *priv = nft_expr_priv(expr); + int err; -static int nft_meta_init_validate_get(uint32_t key) -{ - switch (key) { + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { case NFT_META_LEN: case NFT_META_PROTOCOL: case NFT_META_NFPROTO: @@ -199,45 +189,47 @@ static int nft_meta_init_validate_get(uint32_t key) case NFT_META_OIFTYPE: case NFT_META_SKUID: case NFT_META_SKGID: -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif - return 0; + break; default: return -EOPNOTSUPP; } + priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; } +EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); int err; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - - if (tb[NFTA_META_DREG]) { - err = nft_meta_init_validate_get(priv->key); - if (err < 0) - return err; - - priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - return nft_validate_data_load(ctx, priv->dreg, NULL, - NFT_DATA_VALUE); + switch (priv->key) { + case NFT_META_MARK: + case NFT_META_PRIORITY: + case NFT_META_NFTRACE: + break; + default: + return -EOPNOTSUPP; } - err = nft_meta_init_validate_set(priv->key); - if (err < 0) - return err; - priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); err = nft_validate_input_register(priv->sreg); if (err < 0) @@ -245,9 +237,10 @@ static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } +EXPORT_SYMBOL_GPL(nft_meta_set_init); -static int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -260,9 +253,10 @@ static int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_get_dump); -static int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -276,13 +270,14 @@ static int nft_meta_set_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_set_dump); static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_get_eval, - .init = nft_meta_init, + .init = nft_meta_get_init, .dump = nft_meta_get_dump, }; @@ -290,7 +285,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_set_eval, - .init = nft_meta_init, + .init = nft_meta_set_init, .dump = nft_meta_set_dump, }; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index d3b1ffe2618..79ff58cd36d 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -31,8 +31,8 @@ struct nft_nat { enum nft_registers sreg_addr_max:8; enum nft_registers sreg_proto_min:8; enum nft_registers sreg_proto_max:8; - int family; - enum nf_nat_manip_type type; + enum nf_nat_manip_type type:8; + u8 family; }; static void nft_nat_eval(const struct nft_expr *expr, @@ -88,6 +88,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); + u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL) @@ -107,9 +108,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; - priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (priv->family != AF_INET && priv->family != AF_INET6) - return -EINVAL; + family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + if (family != ctx->afi->family) + return -EOPNOTSUPP; + priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { priv->sreg_addr_min = ntohl(nla_get_be32( @@ -171,12 +175,14 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be32(skb, NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min))) - goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max))) - goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } return 0; nla_put_failure: @@ -202,13 +208,7 @@ static struct nft_expr_type nft_nat_type __read_mostly = { static int __init nft_nat_module_init(void) { - int err; - - err = nft_register_expr(&nft_nat_type); - if (err < 0) - return err; - - return 0; + return nft_register_expr(&nft_nat_type); } static void __exit nft_nat_module_exit(void) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a2aeb318678..85daa84bfdf 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -135,7 +135,8 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) return ERR_PTR(-EINVAL); - if (len <= 4 && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER) + if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && + base != NFT_PAYLOAD_LL_HEADER) return &nft_payload_fast_ops; else return &nft_payload_ops; diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index cbea473d69e..e8ae2f6bf23 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -25,7 +25,6 @@ struct nft_queue { u16 queuenum; u16 queues_total; u16 flags; - u8 family; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -43,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr, queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, - priv->queues_total, priv->family, + priv->queues_total, pkt->ops->pf, jhash_initval); } } @@ -71,7 +70,6 @@ static int nft_queue_init(const struct nft_ctx *ctx, return -EINVAL; init_hashrandom(&jhash_initval); - priv->family = ctx->afi->family; priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); if (tb[NFTA_QUEUE_TOTAL] != NULL) diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index ca0c1b231bf..e1836ff8819 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -18,6 +18,8 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +static DEFINE_SPINLOCK(nft_rbtree_lock); + struct nft_rbtree { struct rb_root root; }; @@ -38,6 +40,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const struct rb_node *parent = priv->root.rb_node; int d; + spin_lock_bh(&nft_rbtree_lock); while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -53,6 +56,8 @@ found: goto out; if (set->flags & NFT_SET_MAP) nft_data_copy(data, rbe->data); + + spin_unlock_bh(&nft_rbtree_lock); return true; } } @@ -62,6 +67,7 @@ found: goto found; } out: + spin_unlock_bh(&nft_rbtree_lock); return false; } @@ -69,8 +75,10 @@ static void nft_rbtree_elem_destroy(const struct nft_set *set, struct nft_rbtree_elem *rbe) { nft_data_uninit(&rbe->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_uninit(rbe->data, set->dtype); + kfree(rbe); } @@ -108,7 +116,8 @@ static int nft_rbtree_insert(const struct nft_set *set, int err; size = sizeof(*rbe); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) size += sizeof(rbe->data[0]); rbe = kzalloc(size, GFP_KERNEL); @@ -117,12 +126,16 @@ static int nft_rbtree_insert(const struct nft_set *set, rbe->flags = elem->flags; nft_data_copy(&rbe->key, &elem->key); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(rbe->data, &elem->data); + spin_lock_bh(&nft_rbtree_lock); err = __nft_rbtree_insert(set, rbe); if (err < 0) kfree(rbe); + + spin_unlock_bh(&nft_rbtree_lock); return err; } @@ -132,7 +145,9 @@ static void nft_rbtree_remove(const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe = elem->cookie; + spin_lock_bh(&nft_rbtree_lock); rb_erase(&rbe->node, &priv->root); + spin_unlock_bh(&nft_rbtree_lock); kfree(rbe); } @@ -143,6 +158,7 @@ static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) struct nft_rbtree_elem *rbe; int d; + spin_lock_bh(&nft_rbtree_lock); while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -153,12 +169,15 @@ static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) parent = parent->rb_right; else { elem->cookie = rbe; - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(&elem->data, rbe->data); elem->flags = rbe->flags; + spin_unlock_bh(&nft_rbtree_lock); return 0; } } + spin_unlock_bh(&nft_rbtree_lock); return -ENOENT; } @@ -171,22 +190,27 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_set_elem elem; struct rb_node *node; + spin_lock_bh(&nft_rbtree_lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { if (iter->count < iter->skip) goto cont; rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_data_copy(&elem.key, &rbe->key); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(&elem.data, rbe->data); elem.flags = rbe->flags; iter->err = iter->fn(ctx, set, iter, &elem); - if (iter->err < 0) + if (iter->err < 0) { + spin_unlock_bh(&nft_rbtree_lock); return; + } cont: iter->count++; } + spin_unlock_bh(&nft_rbtree_lock); } static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) @@ -195,6 +219,7 @@ static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) } static int nft_rbtree_init(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * const nla[]) { struct nft_rbtree *priv = nft_set_priv(set); @@ -209,15 +234,37 @@ static void nft_rbtree_destroy(const struct nft_set *set) struct nft_rbtree_elem *rbe; struct rb_node *node; + spin_lock_bh(&nft_rbtree_lock); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_rbtree_elem_destroy(set, rbe); } + spin_unlock_bh(&nft_rbtree_lock); +} + +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int nsize; + + nsize = sizeof(struct nft_rbtree_elem); + if (features & NFT_SET_MAP) + nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + + if (desc->size) + est->size = sizeof(struct nft_rbtree) + desc->size * nsize; + else + est->size = nsize; + + est->class = NFT_SET_CLASS_O_LOG_N; + + return true; } static struct nft_set_ops nft_rbtree_ops __read_mostly = { .privsize = nft_rbtree_privsize, + .estimate = nft_rbtree_estimate, .init = nft_rbtree_init, .destroy = nft_rbtree_destroy, .insert = nft_rbtree_insert, diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 5e204711d70..f3448c29644 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -16,65 +16,23 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> -#include <net/icmp.h> -#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) -#include <net/netfilter/ipv6/nf_reject.h> -#endif - -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; - u8 family; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) -{ - struct nft_reject *priv = nft_expr_priv(expr); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); -#endif - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (priv->family == NFPROTO_IPV4) - nf_send_unreach(pkt->skb, priv->icmp_code); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - else if (priv->family == NFPROTO_IPV6) - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); -#endif - break; - case NFT_REJECT_TCP_RST: - if (priv->family == NFPROTO_IPV4) - nf_send_reset(pkt->skb, pkt->ops->hooknum); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - else if (priv->family == NFPROTO_IPV6) - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); -#endif - break; - } - - data[NFT_REG_VERDICT].verdict = NF_DROP; -} - -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; +EXPORT_SYMBOL_GPL(nft_reject_policy); -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; - priv->family = ctx->afi->family; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: @@ -89,8 +47,9 @@ static int nft_reject_init(const struct nft_ctx *ctx, return 0; } +EXPORT_SYMBOL_GPL(nft_reject_init); -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_reject *priv = nft_expr_priv(expr); @@ -109,37 +68,7 @@ static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) nla_put_failure: return -1; } - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, -}; - -static struct nft_expr_type nft_reject_type __read_mostly = { - .name = "reject", - .ops = &nft_reject_ops, - .policy = nft_reject_policy, - .maxattr = NFTA_REJECT_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_reject_module_init(void) -{ - return nft_register_expr(&nft_reject_type); -} - -static void __exit nft_reject_module_exit(void) -{ - nft_unregister_expr(&nft_reject_type); -} - -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); +EXPORT_SYMBOL_GPL(nft_reject_dump); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("reject"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 00000000000..b718a52a465 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + return nft_reject_ipv4_eval(expr, data, pkt); + case NFPROTO_IPV6: + return nft_reject_ipv6_eval(expr, data, pkt); + } +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { + .type = &nft_reject_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_inet_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "reject", + .ops = &nft_reject_inet_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ + return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ + nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 3228d7f24eb..4973cbddc44 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -146,11 +146,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) if (par->family == NFPROTO_BRIDGE) { switch (eth_hdr(skb)->h_proto) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): audit_ip4(ab, skb); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): audit_ip6(ab, skb); break; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 5929be622c5..75747aecdeb 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -228,12 +228,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err3; } - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); - - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &par->net->ct.tmpl); + nf_conntrack_tmpl_insert(par->net, ct); out: info->ct = ct; return 0; diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 12d4da8e6c7..bbffdbdaf60 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -23,10 +23,11 @@ MODULE_ALIAS("ip6t_bpf"); static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; - struct sock_fprog program; + struct sock_fprog_kern program; program.len = info->bpf_program_num_elem; - program.filter = (struct sock_filter __user *) info->bpf_program; + program.filter = info->bpf_program; + if (sk_unattached_filter_create(&info->filter, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 9a8e77e7f8d..f4e83300532 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -54,7 +54,8 @@ static struct xt_match cgroup_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_cgroup_info), .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), }; static int __init cgroup_mt_init(void) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index c40b2695633..fbc66bb250d 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -19,6 +19,7 @@ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/list.h> +#include <linux/rbtree.h> #include <linux/module.h> #include <linux/random.h> #include <linux/skbuff.h> @@ -31,6 +32,16 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> +#define CONNLIMIT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS 8U +#else +#define CONNLIMIT_LOCK_SLOTS 256U +#endif + +#define CONNLIMIT_GC_MAX_NODES 8 + /* we will save the tuples of all connections we care about */ struct xt_connlimit_conn { struct hlist_node node; @@ -38,16 +49,27 @@ struct xt_connlimit_conn { union nf_inet_addr addr; }; +struct xt_connlimit_rb { + struct rb_node node; + struct hlist_head hhead; /* connections/hosts in same subnet */ + union nf_inet_addr addr; /* search key */ +}; + +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + struct xt_connlimit_data { - struct hlist_head iphash[256]; - spinlock_t lock; + struct rb_root climit_root4[CONNLIMIT_SLOTS]; + struct rb_root climit_root6[CONNLIMIT_SLOTS]; }; static u_int32_t connlimit_rnd __read_mostly; +static struct kmem_cache *connlimit_rb_cachep __read_mostly; +static struct kmem_cache *connlimit_conn_cachep __read_mostly; static inline unsigned int connlimit_iphash(__be32 addr) { - return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; + return jhash_1word((__force __u32)addr, + connlimit_rnd) % CONNLIMIT_SLOTS; } static inline unsigned int @@ -60,7 +82,8 @@ connlimit_iphash6(const union nf_inet_addr *addr, for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) res.ip6[i] = addr->ip6[i] & mask->ip6[i]; - return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; + return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), + connlimit_rnd) % CONNLIMIT_SLOTS; } static inline bool already_closed(const struct nf_conn *conn) @@ -72,13 +95,14 @@ static inline bool already_closed(const struct nf_conn *conn) return 0; } -static inline unsigned int +static int same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, const union nf_inet_addr *u3, u_int8_t family) { if (family == NFPROTO_IPV4) { - return (addr->ip & mask->ip) == (u3->ip & mask->ip); + return ntohl(addr->ip & mask->ip) - + ntohl(u3->ip & mask->ip); } else { union nf_inet_addr lh, rh; unsigned int i; @@ -88,89 +112,205 @@ same_source_net(const union nf_inet_addr *addr, rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; } - return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; + return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)); } } -static int count_them(struct net *net, - struct xt_connlimit_data *data, +static bool add_hlist(struct hlist_head *head, const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - const union nf_inet_addr *mask, - u_int8_t family) + const union nf_inet_addr *addr) +{ + struct xt_connlimit_conn *conn; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return false; + conn->tuple = *tuple; + conn->addr = *addr; + hlist_add_head(&conn->node, head); + return true; +} + +static unsigned int check_hlist(struct net *net, + struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + bool *addit) { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; struct hlist_node *n; struct nf_conn *found_ct; - struct hlist_head *hash; - bool addit = true; - int matches = 0; - - if (family == NFPROTO_IPV6) - hash = &data->iphash[connlimit_iphash6(addr, mask)]; - else - hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; + unsigned int length = 0; + *addit = true; rcu_read_lock(); /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, hash, node) { + hlist_for_each_entry_safe(conn, n, head, node) { found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &conn->tuple); - found_ct = NULL; + if (found == NULL) { + hlist_del(&conn->node); + kmem_cache_free(connlimit_conn_cachep, conn); + continue; + } - if (found != NULL) - found_ct = nf_ct_tuplehash_to_ctrack(found); + found_ct = nf_ct_tuplehash_to_ctrack(found); - if (found_ct != NULL && - nf_ct_tuple_equal(&conn->tuple, tuple) && - !already_closed(found_ct)) + if (nf_ct_tuple_equal(&conn->tuple, tuple)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". */ - addit = false; - - if (found == NULL) { - /* this one is gone */ - hlist_del(&conn->node); - kfree(conn); - continue; - } - - if (already_closed(found_ct)) { + *addit = false; + } else if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); hlist_del(&conn->node); - kfree(conn); + kmem_cache_free(connlimit_conn_cachep, conn); continue; } - if (same_source_net(addr, mask, &conn->addr, family)) - /* same source network -> be counted! */ - ++matches; nf_ct_put(found_ct); + length++; } rcu_read_unlock(); - if (addit) { - /* save the new connection in our list */ - conn = kmalloc(sizeof(*conn), GFP_ATOMIC); - if (conn == NULL) - return -ENOMEM; - conn->tuple = *tuple; - conn->addr = *addr; - hlist_add_head(&conn->node, hash); - ++matches; + return length; +} + +static void tree_nodes_free(struct rb_root *root, + struct xt_connlimit_rb *gc_nodes[], + unsigned int gc_count) +{ + struct xt_connlimit_rb *rbconn; + + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + rb_erase(&rbconn->node, root); + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, const union nf_inet_addr *mask, + u8 family) +{ + struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct xt_connlimit_rb *rbconn; + struct xt_connlimit_conn *conn; + unsigned int gc_count; + bool no_gc = false; + + restart: + gc_count = 0; + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + bool addit; + + rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + + parent = *rbnode; + diff = same_source_net(addr, mask, &rbconn->addr, family); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* same source network -> be counted! */ + unsigned int count; + count = check_hlist(net, &rbconn->hhead, tuple, &addit); + + tree_nodes_free(root, gc_nodes, gc_count); + if (!addit) + return count; + + if (!add_hlist(&rbconn->hhead, tuple, addr)) + return 0; /* hotdrop */ + + return count + 1; + } + + if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + /* only used for GC on hhead, retval and 'addit' ignored */ + check_hlist(net, &rbconn->hhead, tuple, &addit); + if (hlist_empty(&rbconn->hhead)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + no_gc = true; + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_node_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0). + */ + goto restart; + } + + /* no match, need to insert new node */ + rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + return 0; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(connlimit_rb_cachep, rbconn); + return 0; + } + + conn->tuple = *tuple; + conn->addr = *addr; + rbconn->addr = *addr; + + INIT_HLIST_HEAD(&rbconn->hhead); + hlist_add_head(&conn->node, &rbconn->hhead); + + rb_link_node(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + return 1; +} + +static int count_them(struct net *net, + struct xt_connlimit_data *data, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + u_int8_t family) +{ + struct rb_root *root; + int count; + u32 hash; + + if (family == NFPROTO_IPV6) { + hash = connlimit_iphash6(addr, mask); + root = &data->climit_root6[hash]; + } else { + hash = connlimit_iphash(addr->ip & mask->ip); + root = &data->climit_root4[hash]; } - return matches; + spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + count = count_tree(net, root, tuple, addr, mask, family); + + spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + return count; } static bool @@ -183,7 +323,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct nf_conntrack_tuple *tuple_ptr = &tuple; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; - int connections; + unsigned int connections; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) @@ -202,12 +342,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) iph->daddr : iph->saddr; } - spin_lock_bh(&info->data->lock); connections = count_them(net, info->data, tuple_ptr, &addr, &info->mask, par->family); - spin_unlock_bh(&info->data->lock); - - if (connections < 0) + if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; @@ -247,29 +384,44 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) return -ENOMEM; } - spin_lock_init(&info->data->lock); - for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) - INIT_HLIST_HEAD(&info->data->iphash[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + info->data->climit_root4[i] = RB_ROOT; + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + info->data->climit_root6[i] = RB_ROOT; return 0; } -static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +static void destroy_tree(struct rb_root *r) { - const struct xt_connlimit_info *info = par->matchinfo; struct xt_connlimit_conn *conn; + struct xt_connlimit_rb *rbconn; struct hlist_node *n; - struct hlist_head *hash = info->data->iphash; + struct rb_node *node; + + while ((node = rb_first(r)) != NULL) { + rbconn = container_of(node, struct xt_connlimit_rb, node); + + rb_erase(node, r); + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) + kmem_cache_free(connlimit_conn_cachep, conn); + + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_connlimit_info *info = par->matchinfo; unsigned int i; nf_ct_l3proto_module_put(par->family); - for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { - hlist_for_each_entry_safe(conn, n, &hash[i], node) { - hlist_del(&conn->node); - kfree(conn); - } - } + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + destroy_tree(&info->data->climit_root4[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + destroy_tree(&info->data->climit_root6[i]); kfree(info->data); } @@ -287,12 +439,40 @@ static struct xt_match connlimit_mt_reg __read_mostly = { static int __init connlimit_mt_init(void) { - return xt_register_match(&connlimit_mt_reg); + int ret, i; + + BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); + BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) + spin_lock_init(&xt_connlimit_locks[i]); + + connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", + sizeof(struct xt_connlimit_conn), + 0, 0, NULL); + if (!connlimit_conn_cachep) + return -ENOMEM; + + connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", + sizeof(struct xt_connlimit_rb), + 0, 0, NULL); + if (!connlimit_rb_cachep) { + kmem_cache_destroy(connlimit_conn_cachep); + return -ENOMEM; + } + ret = xt_register_match(&connlimit_mt_reg); + if (ret != 0) { + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); + } + return ret; } static void __exit connlimit_mt_exit(void) { xt_unregister_match(&connlimit_mt_reg); + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c index a4c7561698c..89d53104c6b 100644 --- a/net/netfilter/xt_ipcomp.c +++ b/net/netfilter/xt_ipcomp.c @@ -60,7 +60,7 @@ static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) } return spi_match(compinfo->spis[0], compinfo->spis[1], - ntohl(chdr->cpi << 16), + ntohs(chdr->cpi), !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); } diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index b3be0ef21f1..8c646ed9c92 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -21,11 +21,14 @@ MODULE_ALIAS("ip6t_nfacct"); static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) { + int overquota; const struct xt_nfacct_match_info *info = par->targinfo; nfnl_acct_update(skb, info->nfacct); - return true; + overquota = nfnl_acct_overquota(skb, info->nfacct); + + return overquota == NFACCT_UNDERQUOTA ? false : true; } static int diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 7174611bd67..c529161cdbf 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -422,4 +422,6 @@ module_exit(xt_osf_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 1e657cf715c..a9faae89f95 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -313,10 +313,7 @@ out: static void recent_table_free(void *addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } static int recent_mt_check(const struct xt_mtchk_param *par, diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 6efe4e5a81c..8fd324116e6 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -5,23 +5,35 @@ * they serve as the hanging-off data accessed through repl.data[]. */ +/* tbl has the following structure equivalent, but is C99 compliant: + * struct { + * struct type##_replace repl; + * struct type##_standard entries[nhooks]; + * struct type##_error term; + * } *tbl; + */ + #define xt_alloc_initial_table(type, typ2) ({ \ unsigned int hook_mask = info->valid_hooks; \ unsigned int nhooks = hweight32(hook_mask); \ unsigned int bytes = 0, hooknum = 0, i = 0; \ struct { \ struct type##_replace repl; \ - struct type##_standard entries[nhooks]; \ - struct type##_error term; \ - } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ + struct type##_standard entries[]; \ + } *tbl; \ + struct type##_error *term; \ + size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ + __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ + tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \ if (tbl == NULL) \ return NULL; \ + term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ - tbl->term = (struct type##_error)typ2##_ERROR_INIT; \ + *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ - sizeof(struct type##_error); \ + sizeof(struct type##_error); \ for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ if (!(hook_mask & 1)) \ continue; \ |
