diff options
Diffstat (limited to 'net/netlink')
| -rw-r--r-- | net/netlink/Kconfig | 19 | ||||
| -rw-r--r-- | net/netlink/Makefile | 3 | ||||
| -rw-r--r-- | net/netlink/af_netlink.c | 1767 | ||||
| -rw-r--r-- | net/netlink/af_netlink.h | 87 | ||||
| -rw-r--r-- | net/netlink/diag.c | 227 | ||||
| -rw-r--r-- | net/netlink/genetlink.c | 814 |
6 files changed, 2231 insertions, 686 deletions
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig new file mode 100644 index 00000000000..2c5e95e9bfb --- /dev/null +++ b/net/netlink/Kconfig @@ -0,0 +1,19 @@ +# +# Netlink Sockets +# + +config NETLINK_MMAP + bool "NETLINK: mmaped IO" + ---help--- + This option enables support for memory mapped netlink IO. This + reduces overhead by avoiding copying data between kernel- and + userspace. + + If unsure, say N. + +config NETLINK_DIAG + tristate "NETLINK: socket monitoring interface" + default n + ---help--- + Support for NETLINK socket monitoring interface used by the ss tool. + If unsure, say Y. diff --git a/net/netlink/Makefile b/net/netlink/Makefile index bdd6ddf4e95..e837917f6c0 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,3 +3,6 @@ # obj-y := af_netlink.o genetlink.o + +obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o +netlink_diag-y := diag.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 478181d53c5..e6fac7e3db5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3,6 +3,7 @@ * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -55,111 +56,869 @@ #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> +#include <linux/vmalloc.h> +#include <linux/if_arp.h> +#include <asm/cacheflush.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> -#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) -#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) - -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 pid; - u32 dst_pid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - struct module *module; -}; +#include "af_netlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[0]; }; +/* state bits */ +#define NETLINK_CONGESTED 0x0 + +/* flags */ #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 #define NETLINK_BROADCAST_SEND_ERROR 0x4 #define NETLINK_RECV_NO_ENOBUFS 0x8 -static inline struct netlink_sock *nlk_sk(struct sock *sk) +static inline int netlink_is_kernel(struct sock *sk) { - return container_of(sk, struct netlink_sock, sk); + return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; } -static inline int netlink_is_kernel(struct sock *sk) +struct netlink_table *nl_table; +EXPORT_SYMBOL_GPL(nl_table); + +static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); + +static int netlink_dump(struct sock *sk); +static void netlink_skb_destructor(struct sk_buff *skb); + +DEFINE_RWLOCK(nl_table_lock); +EXPORT_SYMBOL_GPL(nl_table_lock); +static atomic_t nl_table_users = ATOMIC_INIT(0); + +#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); + +static ATOMIC_NOTIFIER_HEAD(netlink_chain); + +static DEFINE_SPINLOCK(netlink_tap_lock); +static struct list_head netlink_tap_all __read_mostly; + +static inline u32 netlink_group_mask(u32 group) { - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; + return group ? 1 << (group - 1) : 0; } -struct nl_pid_hash { - struct hlist_head *table; - unsigned long rehash_time; +static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u32 portid) +{ + return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; +} - unsigned int mask; - unsigned int shift; +int netlink_add_tap(struct netlink_tap *nt) +{ + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) + return -EINVAL; - unsigned int entries; - unsigned int max_shift; + spin_lock(&netlink_tap_lock); + list_add_rcu(&nt->list, &netlink_tap_all); + spin_unlock(&netlink_tap_lock); - u32 rnd; -}; + if (nt->module) + __module_get(nt->module); -struct netlink_table { - struct nl_pid_hash hash; - struct hlist_head mc_list; - struct listeners __rcu *listeners; - unsigned int nl_nonroot; - unsigned int groups; - struct mutex *cb_mutex; - struct module *module; - int registered; + return 0; +} +EXPORT_SYMBOL_GPL(netlink_add_tap); + +static int __netlink_remove_tap(struct netlink_tap *nt) +{ + bool found = false; + struct netlink_tap *tmp; + + spin_lock(&netlink_tap_lock); + + list_for_each_entry(tmp, &netlink_tap_all, list) { + if (nt == tmp) { + list_del_rcu(&nt->list); + found = true; + goto out; + } + } + + pr_warn("__netlink_remove_tap: %p not found\n", nt); +out: + spin_unlock(&netlink_tap_lock); + + if (found && nt->module) + module_put(nt->module); + + return found ? 0 : -ENODEV; +} + +int netlink_remove_tap(struct netlink_tap *nt) +{ + int ret; + + ret = __netlink_remove_tap(nt); + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(netlink_remove_tap); + +static bool netlink_filter_tap(const struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + bool pass = false; + + /* We take the more conservative approach and + * whitelist socket protocols that may pass. + */ + switch (sk->sk_protocol) { + case NETLINK_ROUTE: + case NETLINK_USERSOCK: + case NETLINK_SOCK_DIAG: + case NETLINK_NFLOG: + case NETLINK_XFRM: + case NETLINK_FIB_LOOKUP: + case NETLINK_NETFILTER: + case NETLINK_GENERIC: + pass = true; + break; + } + + return pass; +} + +static int __netlink_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct sk_buff *nskb; + struct sock *sk = skb->sk; + int ret = -ENOMEM; + + dev_hold(dev); + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + nskb->dev = dev; + nskb->protocol = htons((u16) sk->sk_protocol); + nskb->pkt_type = netlink_is_kernel(sk) ? + PACKET_KERNEL : PACKET_USER; + + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + } + + dev_put(dev); + return ret; +} + +static void __netlink_deliver_tap(struct sk_buff *skb) +{ + int ret; + struct netlink_tap *tmp; + + if (!netlink_filter_tap(skb)) + return; + + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + ret = __netlink_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; + } +} + +static void netlink_deliver_tap(struct sk_buff *skb) +{ + rcu_read_lock(); + + if (unlikely(!list_empty(&netlink_tap_all))) + __netlink_deliver_tap(skb); + + rcu_read_unlock(); +} + +static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, + struct sk_buff *skb) +{ + if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) + netlink_deliver_tap(skb); +} + +static void netlink_overrun(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } + } + atomic_inc(&sk->sk_drops); +} + +static void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(NETLINK_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +#ifdef CONFIG_NETLINK_MMAP +static bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +} + +static bool netlink_rx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->rx_ring.pg_vec != NULL; +} + +static bool netlink_tx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->tx_ring.pg_vec != NULL; +} + +static __pure struct page *pgvec_to_page(const void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + else + return virt_to_page(addr); +} + +static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) { + if (pg_vec[i] != NULL) { + if (is_vmalloc_addr(pg_vec[i])) + vfree(pg_vec[i]); + else + free_pages((unsigned long)pg_vec[i], order); + } + } + kfree(pg_vec); +} + +static void *alloc_one_pg_vec_page(unsigned long order) +{ + void *buffer; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | + __GFP_NOWARN | __GFP_NORETRY; + + buffer = (void *)__get_free_pages(gfp_flags, order); + if (buffer != NULL) + return buffer; + + buffer = vzalloc((1 << order) * PAGE_SIZE); + if (buffer != NULL) + return buffer; + + gfp_flags &= ~__GFP_NORETRY; + return (void *)__get_free_pages(gfp_flags, order); +} + +static void **alloc_pg_vec(struct netlink_sock *nlk, + struct nl_mmap_req *req, unsigned int order) +{ + unsigned int block_nr = req->nm_block_nr; + unsigned int i; + void **pg_vec; + + pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); + if (pg_vec == NULL) + return NULL; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = alloc_one_pg_vec_page(order); + if (pg_vec[i] == NULL) + goto err1; + } + + return pg_vec; +err1: + free_pg_vec(pg_vec, order, block_nr); + return NULL; +} + +static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, + bool closing, bool tx_ring) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct sk_buff_head *queue; + void **pg_vec = NULL; + unsigned int order = 0; + int err; + + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; + + if (!closing) { + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; + } + + if (req->nm_block_nr) { + if (ring->pg_vec != NULL) + return -EBUSY; + + if ((int)req->nm_block_size <= 0) + return -EINVAL; + if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) + return -EINVAL; + if (req->nm_frame_size < NL_MMAP_HDRLEN) + return -EINVAL; + if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) + return -EINVAL; + + ring->frames_per_block = req->nm_block_size / + req->nm_frame_size; + if (ring->frames_per_block == 0) + return -EINVAL; + if (ring->frames_per_block * req->nm_block_nr != + req->nm_frame_nr) + return -EINVAL; + + order = get_order(req->nm_block_size); + pg_vec = alloc_pg_vec(nlk, req, order); + if (pg_vec == NULL) + return -ENOMEM; + } else { + if (req->nm_frame_nr) + return -EINVAL; + } + + err = -EBUSY; + mutex_lock(&nlk->pg_vec_lock); + if (closing || atomic_read(&nlk->mapped) == 0) { + err = 0; + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + } + mutex_unlock(&nlk->pg_vec_lock); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); + return err; +} + +static void netlink_mm_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_inc(&nlk_sk(sk)->mapped); +} + +static void netlink_mm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_dec(&nlk_sk(sk)->mapped); +} + +static const struct vm_operations_struct netlink_mmap_ops = { + .open = netlink_mm_open, + .close = netlink_mm_close, }; -static struct netlink_table *nl_table; +static int netlink_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + unsigned long start, size, expected; + unsigned int i; + int err = -EINVAL; -static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); + if (vma->vm_pgoff) + return -EINVAL; -static int netlink_dump(struct sock *sk); -static void netlink_destroy_callback(struct netlink_callback *cb); + mutex_lock(&nlk->pg_vec_lock); -static DEFINE_RWLOCK(nl_table_lock); -static atomic_t nl_table_users = ATOMIC_INIT(0); + expected = 0; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; + } -static ATOMIC_NOTIFIER_HEAD(netlink_chain); + if (expected == 0) + goto out; + + size = vma->vm_end - vma->vm_start; + if (size != expected) + goto out; + + start = vma->vm_start; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; -static u32 netlink_group_mask(u32 group) + for (i = 0; i < ring->pg_vec_len; i++) { + struct page *page; + void *kaddr = ring->pg_vec[i]; + unsigned int pg_num; + + for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { + page = pgvec_to_page(kaddr); + err = vm_insert_page(vma, start, page); + if (err < 0) + goto out; + start += PAGE_SIZE; + kaddr += PAGE_SIZE; + } + } + } + + atomic_inc(&nlk->mapped); + vma->vm_ops = &netlink_mmap_ops; + err = 0; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} + +static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) { - return group ? 1 << (group - 1) : 0; +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 + struct page *p_start, *p_end; + + /* First page is flushed through netlink_{get,set}_status */ + p_start = pgvec_to_page(hdr + PAGE_SIZE); + p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1); + while (p_start <= p_end) { + flush_dcache_page(p_start); + p_start++; + } +#endif } -static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) +static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) { - return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; + smp_rmb(); + flush_dcache_page(pgvec_to_page(hdr)); + return hdr->nm_status; +} + +static void netlink_set_status(struct nl_mmap_hdr *hdr, + enum nl_mmap_status status) +{ + hdr->nm_status = status; + flush_dcache_page(pgvec_to_page(hdr)); + smp_wmb(); +} + +static struct nl_mmap_hdr * +__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) +{ + unsigned int pg_vec_pos, frame_off; + + pg_vec_pos = pos / ring->frames_per_block; + frame_off = pos % ring->frames_per_block; + + return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); +} + +static struct nl_mmap_hdr * +netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, + enum nl_mmap_status status) +{ + struct nl_mmap_hdr *hdr; + + hdr = __netlink_lookup_frame(ring, pos); + if (netlink_get_status(hdr) != status) + return NULL; + + return hdr; +} + +static struct nl_mmap_hdr * +netlink_current_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + return netlink_lookup_frame(ring, ring->head, status); +} + +static struct nl_mmap_hdr * +netlink_previous_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + unsigned int prev; + + prev = ring->head ? ring->head - 1 : ring->frame_max; + return netlink_lookup_frame(ring, prev, status); +} + +static void netlink_increment_head(struct netlink_ring *ring) +{ + ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; +} + +static void netlink_forward_ring(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) + break; + if (hdr->nm_status != NL_MMAP_STATUS_SKIP) + break; + netlink_increment_head(ring); + } while (ring->head != head); +} + +static bool netlink_dump_space(struct netlink_sock *nlk) +{ + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + unsigned int n; + + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + return false; + + n = ring->head + ring->frame_max / 2; + if (n > ring->frame_max) + n -= ring->frame_max; + + hdr = __netlink_lookup_frame(ring, n); + + return hdr->nm_status == NL_MMAP_STATUS_UNUSED; +} + +static unsigned int netlink_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int mask; + int err; + + if (nlk->rx_ring.pg_vec != NULL) { + /* Memory mapped sockets don't call recvmsg(), so flow control + * for dumps is performed here. A dump is allowed to continue + * if at least half the ring is unused. + */ + while (nlk->cb_running && netlink_dump_space(nlk)) { + err = netlink_dump(sk); + if (err < 0) { + sk->sk_err = -err; + sk->sk_error_report(sk); + break; + } + } + netlink_rcv_wake(sk); + } + + mask = datagram_poll(file, sock, wait); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + netlink_forward_ring(&nlk->rx_ring); + if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + spin_lock_bh(&sk->sk_write_queue.lock); + if (nlk->tx_ring.pg_vec) { + if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLOUT | POLLWRNORM; + } + spin_unlock_bh(&sk->sk_write_queue.lock); + + return mask; +} + +static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) +{ + return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); +} + +static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, + struct netlink_ring *ring, + struct nl_mmap_hdr *hdr) +{ + unsigned int size; + void *data; + + size = ring->frame_size - NL_MMAP_HDRLEN; + data = (void *)hdr + NL_MMAP_HDRLEN; + + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->len = 0; + + skb->destructor = netlink_skb_destructor; + NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; + NETLINK_CB(skb).sk = sk; +} + +static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, + u32 dst_portid, u32 dst_group, + struct sock_iocb *siocb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + struct sk_buff *skb; + unsigned int maxlen; + bool excl = true; + int err = 0, len = 0; + + /* Netlink messages are validated by the receiver before processing. + * In order to avoid userspace changing the contents of the message + * after validation, the socket and the ring may only be used by a + * single process, otherwise we fall back to copying. + */ + if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 || + atomic_read(&nlk->mapped) > 1) + excl = false; + + mutex_lock(&nlk->pg_vec_lock); + + ring = &nlk->tx_ring; + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + + do { + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); + if (hdr == NULL) { + if (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending)) + schedule(); + continue; + } + if (hdr->nm_len > maxlen) { + err = -EINVAL; + goto out; + } + + netlink_frame_flush_dcache(hdr); + + if (likely(dst_portid == 0 && dst_group == 0 && excl)) { + skb = alloc_skb_head(GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + sock_hold(sk); + netlink_ring_setup_skb(skb, sk, ring, hdr); + NETLINK_CB(skb).flags |= NETLINK_SKB_TX; + __skb_put(skb, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + } else { + skb = alloc_skb(hdr->nm_len, GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + __skb_put(skb, hdr->nm_len); + memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + } + + netlink_increment_head(ring); + + NETLINK_CB(skb).portid = nlk->portid; + NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).creds = siocb->scm->creds; + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (unlikely(dst_group)) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_portid, dst_group, + GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_portid, + msg->msg_flags & MSG_DONTWAIT); + if (err < 0) + goto out; + len += err; + + } while (hdr != NULL || + (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending))); + + if (len > 0) + err = len; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} + +static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) +{ + struct nl_mmap_hdr *hdr; + + hdr = netlink_mmap_hdr(skb); + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_frame_flush_dcache(hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + + NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; + kfree_skb(skb); +} + +static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + + spin_lock_bh(&sk->sk_receive_queue.lock); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) { + spin_unlock_bh(&sk->sk_receive_queue.lock); + kfree_skb(skb); + netlink_overrun(sk); + return; + } + netlink_increment_head(ring); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_set_status(hdr, NL_MMAP_STATUS_COPY); +} + +#else /* CONFIG_NETLINK_MMAP */ +#define netlink_skb_is_mmaped(skb) false +#define netlink_rx_is_mmaped(sk) false +#define netlink_tx_is_mmaped(sk) false +#define netlink_mmap sock_no_mmap +#define netlink_poll datagram_poll +#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 +#endif /* CONFIG_NETLINK_MMAP */ + +static void netlink_skb_destructor(struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + struct nl_mmap_hdr *hdr; + struct netlink_ring *ring; + struct sock *sk; + + /* If a packet from the kernel to userspace was freed because of an + * error without being delivered to userspace, the kernel must reset + * the status. In the direction userspace to kernel, the status is + * always reset here after the packet was processed and freed. + */ + if (netlink_skb_is_mmaped(skb)) { + hdr = netlink_mmap_hdr(skb); + sk = NETLINK_CB(skb).sk; + + if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + ring = &nlk_sk(sk)->tx_ring; + } else { + if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { + hdr->nm_len = 0; + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + } + ring = &nlk_sk(sk)->rx_ring; + } + + WARN_ON(atomic_read(&ring->pending) == 0); + atomic_dec(&ring->pending); + sock_put(sk); + + skb->head = NULL; + } +#endif + if (is_vmalloc_addr(skb->head)) { + if (!skb->cloned || + !atomic_dec_return(&(skb_shinfo(skb)->dataref))) + vfree(skb->head); + + skb->head = NULL; + } + if (skb->sk != NULL) + sock_rfree(skb); +} + +static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + WARN_ON(skb->sk != NULL); + skb->sk = sk; + skb->destructor = netlink_skb_destructor; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (nlk->cb) { - if (nlk->cb->done) - nlk->cb->done(nlk->cb); - netlink_destroy_callback(nlk->cb); + if (nlk->cb_running) { + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); + + module_put(nlk->cb.module); + kfree_skb(nlk->cb.skb); } skb_queue_purge(&sk->sk_receive_queue); +#ifdef CONFIG_NETLINK_MMAP + if (1) { + struct nl_mmap_req req; + + memset(&req, 0, sizeof(req)); + if (nlk->rx_ring.pg_vec) + netlink_set_ring(sk, &req, true, false); + memset(&req, 0, sizeof(req)); + if (nlk->tx_ring.pg_vec) + netlink_set_ring(sk, &req, true, true); + } +#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -226,18 +985,23 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } -static inline struct sock *netlink_lookup(struct net *net, int protocol, - u32 pid) +static bool netlink_compare(struct net *net, struct sock *sk) { - struct nl_pid_hash *hash = &nl_table[protocol].hash; + return net_eq(sock_net(sk), net); +} + +static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) +{ + struct netlink_table *table = &nl_table[protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *sk; - struct hlist_node *node; read_lock(&nl_table_lock); - head = nl_pid_hashfn(hash, pid); - sk_for_each(sk, node, head) { - if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) { + head = nl_portid_hashfn(hash, portid); + sk_for_each(sk, head) { + if (table->compare(net, sk) && + (nlk_sk(sk)->portid == portid)) { sock_hold(sk); goto found; } @@ -248,7 +1012,7 @@ found: return sk; } -static inline struct hlist_head *nl_pid_hash_zalloc(size_t size) +static struct hlist_head *nl_portid_hash_zalloc(size_t size) { if (size <= PAGE_SIZE) return kzalloc(size, GFP_ATOMIC); @@ -258,7 +1022,7 @@ static inline struct hlist_head *nl_pid_hash_zalloc(size_t size) get_order(size)); } -static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) +static void nl_portid_hash_free(struct hlist_head *table, size_t size) { if (size <= PAGE_SIZE) kfree(table); @@ -266,7 +1030,7 @@ static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) free_pages((unsigned long)table, get_order(size)); } -static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) +static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow) { unsigned int omask, mask, shift; size_t osize, size; @@ -284,7 +1048,7 @@ static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) size *= 2; } - table = nl_pid_hash_zalloc(size); + table = nl_portid_hash_zalloc(size); if (!table) return 0; @@ -296,26 +1060,26 @@ static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) for (i = 0; i <= omask; i++) { struct sock *sk; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; - sk_for_each_safe(sk, node, tmp, &otable[i]) - __sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid)); + sk_for_each_safe(sk, tmp, &otable[i]) + __sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid)); } - nl_pid_hash_free(otable, osize); + nl_portid_hash_free(otable, osize); hash->rehash_time = jiffies + 10 * 60 * HZ; return 1; } -static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) +static inline int nl_portid_hash_dilute(struct nl_portid_hash *hash, int len) { int avg = hash->entries >> hash->shift; - if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1)) + if (unlikely(avg > 1) && nl_portid_hash_rehash(hash, 1)) return 1; if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { - nl_pid_hash_rehash(hash, 0); + nl_portid_hash_rehash(hash, 0); return 1; } @@ -328,54 +1092,59 @@ static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; - struct hlist_node *node; unsigned long mask; unsigned int i; + struct listeners *listeners; + + listeners = nl_deref_protected(tbl->listeners); + if (!listeners) + return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; - sk_for_each_bound(sk, node, &tbl->mc_list) { + sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } - tbl->listeners->masks[i] = mask; + listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } -static int netlink_insert(struct sock *sk, struct net *net, u32 pid) +static int netlink_insert(struct sock *sk, struct net *net, u32 portid) { - struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; int err = -EADDRINUSE; struct sock *osk; - struct hlist_node *node; int len; netlink_table_grab(); - head = nl_pid_hashfn(hash, pid); + head = nl_portid_hashfn(hash, portid); len = 0; - sk_for_each(osk, node, head) { - if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->pid == pid)) + sk_for_each(osk, head) { + if (table->compare(net, osk) && + (nlk_sk(osk)->portid == portid)) break; len++; } - if (node) + if (osk) goto err; err = -EBUSY; - if (nlk_sk(sk)->pid) + if (nlk_sk(sk)->portid) goto err; err = -ENOMEM; if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) goto err; - if (len && nl_pid_hash_dilute(hash, len)) - head = nl_pid_hashfn(hash, pid); + if (len && nl_portid_hash_dilute(hash, len)) + head = nl_portid_hashfn(hash, portid); hash->entries++; - nlk_sk(sk)->pid = pid; + nlk_sk(sk)->portid = portid; sk_add_node(sk, head); err = 0; @@ -415,13 +1184,16 @@ static int __netlink_create(struct net *net, struct socket *sock, sock_init_data(sock, sk); nlk = nlk_sk(sk); - if (cb_mutex) + if (cb_mutex) { nlk->cb_mutex = cb_mutex; - else { + } else { nlk->cb_mutex = &nlk->cb_def_mutex; mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); +#ifdef CONFIG_NETLINK_MMAP + mutex_init(&nlk->pg_vec_lock); +#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -434,6 +1206,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; + int (*bind)(int group); + void (*unbind)(int group); int err = 0; sock->state = SS_UNCONNECTED; @@ -458,6 +1232,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, else err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; + bind = nl_table[protocol].bind; + unbind = nl_table[protocol].unbind; netlink_unlock_table(); if (err < 0) @@ -473,6 +1249,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk = nlk_sk(sock->sk); nlk->module = module; + nlk->netlink_bind = bind; + nlk->netlink_unbind = unbind; out: return err; @@ -503,11 +1281,11 @@ static int netlink_release(struct socket *sock) skb_queue_purge(&sk->sk_write_queue); - if (nlk->pid) { + if (nlk->portid) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, - .pid = nlk->pid, + .portid = nlk->portid, }; atomic_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); @@ -519,12 +1297,20 @@ static int netlink_release(struct socket *sock) if (netlink_is_kernel(sk)) { BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { - kfree(nl_table[sk->sk_protocol].listeners); + struct listeners *old; + + old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); + RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); + kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].unbind = NULL; + nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } - } else if (nlk->subscriptions) + } else if (nlk->subscriptions) { netlink_update_listeners(sk); + } netlink_table_ungrab(); kfree(nlk->groups); @@ -541,24 +1327,24 @@ static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *osk; - struct hlist_node *node; - s32 pid = task_tgid_vnr(current); + s32 portid = task_tgid_vnr(current); int err; static s32 rover = -4097; retry: cond_resched(); netlink_table_grab(); - head = nl_pid_hashfn(hash, pid); - sk_for_each(osk, node, head) { - if (!net_eq(sock_net(osk), net)) + head = nl_portid_hashfn(hash, portid); + sk_for_each(osk, head) { + if (!table->compare(net, osk)) continue; - if (nlk_sk(osk)->pid == pid) { - /* Bind collision, search negative pid values. */ - pid = rover--; + if (nlk_sk(osk)->portid == portid) { + /* Bind collision, search negative portid values. */ + portid = rover--; if (rover > -4097) rover = -4097; netlink_table_ungrab(); @@ -567,7 +1353,7 @@ retry: } netlink_table_ungrab(); - err = netlink_insert(sk, net, pid); + err = netlink_insert(sk, net, portid); if (err == -EADDRINUSE) goto retry; @@ -578,10 +1364,77 @@ retry: return err; } -static inline int netlink_capable(struct socket *sock, unsigned int flag) +/** + * __netlink_ns_capable - General netlink message capability test + * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in the user namespace @user_ns. + */ +bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, + struct user_namespace *user_ns, int cap) +{ + return ((nsp->flags & NETLINK_SKB_DST) || + file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(__netlink_ns_capable); + +/** + * netlink_ns_capable - General netlink message capability test + * @skb: socket buffer holding a netlink command from userspace + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in the user namespace @user_ns. + */ +bool netlink_ns_capable(const struct sk_buff *skb, + struct user_namespace *user_ns, int cap) +{ + return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); +} +EXPORT_SYMBOL(netlink_ns_capable); + +/** + * netlink_capable - Netlink global message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in all user namespaces. + */ +bool netlink_capable(const struct sk_buff *skb, int cap) { - return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || - capable(CAP_NET_ADMIN); + return netlink_ns_capable(skb, &init_user_ns, cap); +} +EXPORT_SYMBOL(netlink_capable); + +/** + * netlink_net_capable - Netlink network namespace message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap over the network namespace of + * the socket we received the message from. + */ +bool netlink_net_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); +} +EXPORT_SYMBOL(netlink_net_capable); + +static inline int netlink_allowed(const struct socket *sock, unsigned int flag) +{ + return (nl_table[sock->sk->sk_protocol].flags & flag) || + ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void @@ -629,6 +1482,19 @@ static int netlink_realloc_groups(struct sock *sk) return err; } +static void netlink_unbind(int group, long unsigned int groups, + struct netlink_sock *nlk) +{ + int undo; + + if (!nlk->netlink_unbind) + return; + + for (undo = 0; undo < group; undo++) + if (test_bit(group, &groups)) + nlk->netlink_unbind(undo); +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -637,38 +1503,59 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + long unsigned int groups = nladdr->nl_groups; + + if (addr_len < sizeof(struct sockaddr_nl)) + return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups) { - if (!netlink_capable(sock, NL_NONROOT_RECV)) + if (groups) { + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } - if (nlk->pid) { - if (nladdr->nl_pid != nlk->pid) + if (nlk->portid) + if (nladdr->nl_pid != nlk->portid) return -EINVAL; - } else { + + if (nlk->netlink_bind && groups) { + int group; + + for (group = 0; group < nlk->ngroups; group++) { + if (!test_bit(group, &groups)) + continue; + err = nlk->netlink_bind(group); + if (!err) + continue; + netlink_unbind(group, groups, nlk); + return err; + } + } + + if (!nlk->portid) { err = nladdr->nl_pid ? netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); - if (err) + if (err) { + netlink_unbind(nlk->ngroups - 1, groups, nlk); return err; + } } - if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) return 0; netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + - hweight32(nladdr->nl_groups) - + hweight32(groups) - hweight32(nlk->groups[0])); - nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); @@ -688,23 +1575,23 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family == AF_UNSPEC) { sk->sk_state = NETLINK_UNCONNECTED; - nlk->dst_pid = 0; + nlk->dst_portid = 0; nlk->dst_group = 0; return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; - /* Only superuser is allowed to send multicasts */ - if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + if ((nladdr->nl_groups || nladdr->nl_pid) && + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->pid) + if (!nlk->portid) err = netlink_autobind(sock); if (err == 0) { sk->sk_state = NETLINK_CONNECTED; - nlk->dst_pid = nladdr->nl_pid; + nlk->dst_portid = nladdr->nl_pid; nlk->dst_group = ffs(nladdr->nl_groups); } @@ -723,41 +1610,28 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, *addr_len = sizeof(*nladdr); if (peer) { - nladdr->nl_pid = nlk->dst_pid; + nladdr->nl_pid = nlk->dst_portid; nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { - nladdr->nl_pid = nlk->pid; + nladdr->nl_pid = nlk->portid; nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; } return 0; } -static void netlink_overrun(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - } - } - atomic_inc(&sk->sk_drops); -} - -static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) +static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; - sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, pid); + sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); if (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_pid != nlk_sk(ssk)->pid) { + nlk->dst_portid != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } @@ -766,7 +1640,7 @@ static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) struct sock *netlink_getsockbyfilp(struct file *filp) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) @@ -780,6 +1654,33 @@ struct sock *netlink_getsockbyfilp(struct file *filp) return sock; } +static struct sk_buff *netlink_alloc_large_skb(unsigned int size, + int broadcast) +{ + struct sk_buff *skb; + void *data; + + if (size <= NLMSG_GOODSIZE || broadcast) + return alloc_skb(size, GFP_KERNEL); + + size = SKB_DATA_ALIGN(size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + data = vmalloc(size); + if (data == NULL) + return NULL; + + skb = build_skb(data, size); + if (skb == NULL) + vfree(data); + else { + skb->head_frag = 0; + skb->destructor = netlink_skb_destructor; + } + + return skb; +} + /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the @@ -797,8 +1698,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) { + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(NETLINK_CONGESTED, &nlk->state)) && + !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -812,7 +1714,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && + test_bit(NETLINK_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -826,16 +1728,32 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, } return 1; } - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); return 0; } -int netlink_sendskb(struct sock *sk, struct sk_buff *skb) +static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, len); + netlink_deliver_tap(skb); + +#ifdef CONFIG_NETLINK_MMAP + if (netlink_skb_is_mmaped(skb)) + netlink_queue_mmaped_skb(sk, skb); + else if (netlink_rx_is_mmaped(sk)) + netlink_ring_set_copied(sk, skb); + else +#endif /* CONFIG_NETLINK_MMAP */ + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk); + return len; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb) +{ + int len = __netlink_sendskb(sk, skb); + sock_put(sk); return len; } @@ -846,22 +1764,23 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb) sock_put(sk); } -static inline struct sk_buff *netlink_trim(struct sk_buff *skb, - gfp_t allocation) +static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; - skb_orphan(skb); + WARN_ON(skb->sk != NULL); + if (netlink_skb_is_mmaped(skb)) + return skb; delta = skb->end - skb->tail; - if (delta * 2 < skb->truesize) + if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; - kfree_skb(skb); + consume_skb(skb); skb = nskb; } @@ -871,17 +1790,8 @@ static inline struct sk_buff *netlink_trim(struct sk_buff *skb, return skb; } -static inline void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(0, &nlk->state); - if (!test_bit(0, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - -static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) +static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, + struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); @@ -889,16 +1799,20 @@ static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); + NETLINK_CB(skb).sk = ssk; + netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); + consume_skb(skb); + } else { + kfree_skb(skb); } - kfree_skb(skb); sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, - u32 pid, int nonblock) + u32 portid, int nonblock) { struct sock *sk; int err; @@ -908,13 +1822,13 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, timeo = sock_sndtimeo(ssk, nonblock); retry: - sk = netlink_getsockbypid(ssk, pid); + sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) - return netlink_unicast_kernel(sk, skb); + return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; @@ -933,6 +1847,73 @@ retry: } EXPORT_SYMBOL(netlink_unicast); +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ +#ifdef CONFIG_NETLINK_MMAP + struct sock *sk = NULL; + struct sk_buff *skb; + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + unsigned int maxlen; + + sk = netlink_getsockbyportid(ssk, dst_portid); + if (IS_ERR(sk)) + goto out; + + ring = &nlk_sk(sk)->rx_ring; + /* fast-path without atomic ops for common case: non-mmaped receiver */ + if (ring->pg_vec == NULL) + goto out_put; + + if (ring->frame_size - NL_MMAP_HDRLEN < size) + goto out_put; + + skb = alloc_skb_head(gfp_mask); + if (skb == NULL) + goto err1; + + spin_lock_bh(&sk->sk_receive_queue.lock); + /* check again under lock */ + if (ring->pg_vec == NULL) + goto out_free; + + /* check again under lock */ + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + if (maxlen < size) + goto out_free; + + netlink_forward_ring(ring); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + netlink_increment_head(ring); + + spin_unlock_bh(&sk->sk_receive_queue.lock); + return skb; + +err2: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + netlink_overrun(sk); +err1: + sock_put(sk); + return NULL; + +out_free: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); +out_put: + sock_put(sk); +out: +#endif + return alloc_skb(size, gfp_mask); +} +EXPORT_SYMBOL_GPL(netlink_alloc_skb); + int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -943,7 +1924,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); - if (group - 1 < nl_table[sk->sk_protocol].groups) + if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); @@ -952,17 +1933,15 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) } EXPORT_SYMBOL_GPL(netlink_has_listeners); -static inline int netlink_broadcast_deliver(struct sock *sk, - struct sk_buff *skb) +static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(0, &nlk->state)) { - skb_set_owner_r(skb, sk); - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; + !test_bit(NETLINK_CONGESTED, &nlk->state)) { + netlink_skb_set_owner_r(skb, sk); + __netlink_sendskb(sk, skb); + return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } return -1; } @@ -970,7 +1949,7 @@ static inline int netlink_broadcast_deliver(struct sock *sk, struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; - u32 pid; + u32 portid; u32 group; int failure; int delivery_failure; @@ -982,7 +1961,7 @@ struct netlink_broadcast_data { void *tx_data; }; -static inline int do_one_broadcast(struct sock *sk, +static int do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); @@ -991,7 +1970,7 @@ static inline int do_one_broadcast(struct sock *sk, if (p->exclude_sk == sk) goto out; - if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; @@ -1043,21 +2022,20 @@ out: return 0; } -int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid, +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; - struct hlist_node *node; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; - info.pid = pid; + info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; @@ -1073,7 +2051,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid, netlink_lock_table(); - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); @@ -1083,8 +2061,8 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid, if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; - } else - consume_skb(info.skb2); + } + consume_skb(info.skb2); if (info.delivered) { if (info.congested && (allocation & __GFP_WAIT)) @@ -1095,23 +2073,22 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid, } EXPORT_SYMBOL(netlink_broadcast_filtered); -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { - return netlink_broadcast_filtered(ssk, skb, pid, group, allocation, + return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; - u32 pid; + u32 portid; u32 group; int code; }; -static inline int do_one_set_err(struct sock *sk, - struct netlink_set_err_data *p) +static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; @@ -1122,7 +2099,7 @@ static inline int do_one_set_err(struct sock *sk, if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; - if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; @@ -1140,29 +2117,28 @@ out: /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() - * @pid: the PID of a process that we want to skip (if any) - * @groups: the broadcast group that will notice the error + * @portid: the PORTID of a process that we want to skip (if any) + * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ -int netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) +int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; - struct hlist_node *node; struct sock *sk; int ret = 0; info.exclude_sk = ssk; - info.pid = pid; + info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock(&nl_table_lock); - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock(&nl_table_lock); @@ -1198,7 +2174,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optlen >= sizeof(int) && + if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && + optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -1212,17 +2189,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { - if (!netlink_capable(sock, NL_NONROOT_RECV)) + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; + if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { + err = nlk->netlink_bind(val); + if (err) + return err; + } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); + if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) + nlk->netlink_unbind(val); + err = 0; break; } @@ -1236,12 +2221,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, case NETLINK_NO_ENOBUFS: if (val) { nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(0, &nlk->state); + clear_bit(NETLINK_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); - } else + } else { nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + } err = 0; break; +#ifdef CONFIG_NETLINK_MMAP + case NETLINK_RX_RING: + case NETLINK_TX_RING: { + struct nl_mmap_req req; + + /* Rings might consume more memory than queue limits, require + * CAP_NET_ADMIN. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (optlen < sizeof(req)) + return -EINVAL; + if (copy_from_user(&req, optval, sizeof(req))) + return -EFAULT; + err = netlink_set_ring(sk, &req, false, + optname == NETLINK_TX_RING); + break; + } +#endif /* CONFIG_NETLINK_MMAP */ default: err = -ENOPROTOOPT; } @@ -1314,21 +2319,21 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr = msg->msg_name; - u32 dst_pid; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); + u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; + u32 netlink_skb_flags = 0; if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; - if (NULL == siocb->scm) { + if (NULL == siocb->scm) siocb->scm = &scm; - memset(&scm, 0, sizeof(scm)); - } - err = scm_send(sock, msg, siocb->scm); + + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; @@ -1336,42 +2341,43 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, err = -EINVAL; if (addr->nl_family != AF_NETLINK) goto out; - dst_pid = addr->nl_pid; + dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; - if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + if ((dst_group || dst_portid) && + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; + netlink_skb_flags |= NETLINK_SKB_DST; } else { - dst_pid = nlk->dst_pid; + dst_portid = nlk->dst_portid; dst_group = nlk->dst_group; } - if (!nlk->pid) { + if (!nlk->portid) { err = netlink_autobind(sock); if (err) goto out; } + if (netlink_tx_is_mmaped(sk) && + msg->msg_iov->iov_base == NULL) { + err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, + siocb); + goto out; + } + err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); + skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; - NETLINK_CB(skb).pid = nlk->pid; + NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).loginuid = audit_get_loginuid(current); - NETLINK_CB(skb).sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &(NETLINK_CB(skb).sid)); - memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - - /* What can I do? Netlink is asynchronous, so that - we will have to save current capabilities to - check them, when this message will be delivered - to corresponding kernel module. --ANK (980802) - */ + NETLINK_CB(skb).creds = siocb->scm->creds; + NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { @@ -1387,9 +2393,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (dst_group) { atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); + netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } - err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT); out: scm_destroy(siocb->scm); @@ -1407,7 +2413,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, int noblock = flags&MSG_DONTWAIT; size_t copied; struct sk_buff *skb, *data_skb; - int err; + int err, ret; if (flags&MSG_OOB) return -EOPNOTSUPP; @@ -1437,7 +2443,10 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } #endif - msg->msg_namelen = 0; + /* Record the max length of recvmsg() calls for future allocations */ + nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); + nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, + 16384); copied = data_skb->len; if (len < copied) { @@ -1449,10 +2458,10 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); if (msg->msg_name) { - struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; - addr->nl_pid = NETLINK_CB(skb).pid; + addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } @@ -1470,8 +2479,14 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, skb_free_datagram(sk, skb); - if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) - netlink_dump(sk); + if (nlk->cb_running && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { + ret = netlink_dump(sk); + if (ret) { + sk->sk_err = -ret; + sk->sk_error_report(sk); + } + } scm_recv(sock, msg, siocb->scm, flags); out: @@ -1479,7 +2494,7 @@ out: return err ? : copied; } -static void netlink_data_ready(struct sock *sk, int len) +static void netlink_data_ready(struct sock *sk) { BUG(); } @@ -1491,14 +2506,15 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, struct module *module) +__netlink_kernel_create(struct net *net, int unit, struct module *module, + struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; + struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL; + unsigned int groups; BUG_ON(!nl_table); @@ -1520,16 +2536,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, sk = sock->sk; sk_change_net(sk, net); - if (groups < 32) + if (!cfg || cfg->groups < 32) groups = 32; + else + groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; - if (input) - nlk_sk(sk)->netlink_rcv = input; + if (cfg && cfg->input) + nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, net, 0)) goto out_sock_release; @@ -1543,6 +2561,12 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; + if (cfg) { + nl_table[unit].bind = cfg->bind; + nl_table[unit].flags = cfg->flags; + if (cfg->compare) + nl_table[unit].compare = cfg->compare; + } nl_table[unit].registered = 1; } else { kfree(listeners); @@ -1560,8 +2584,7 @@ out_sock_release_nosk: sock_release(sock); return NULL; } -EXPORT_SYMBOL(netlink_kernel_create); - +EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) @@ -1570,12 +2593,6 @@ netlink_kernel_release(struct sock *sk) } EXPORT_SYMBOL(netlink_kernel_release); - -static void listeners_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct listeners, rcu)); -} - int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; @@ -1588,11 +2605,11 @@ int __netlink_change_ngroups(struct sock *sk, unsigned int groups) new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; - old = rcu_dereference_raw(tbl->listeners); + old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); - call_rcu(&old->rcu, listeners_free_rcu); + kfree_rcu(old, rcu); } tbl->groups = groups; @@ -1625,40 +2642,29 @@ int netlink_change_ngroups(struct sock *sk, unsigned int groups) void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; - struct hlist_node *node; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; - sk_for_each_bound(sk, node, &tbl->mc_list) + sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } -/** - * netlink_clear_multicast_users - kick off multicast listeners - * - * This function removes all listeners from the given group. - * @ksk: The kernel netlink socket, as returned by - * netlink_kernel_create(). - * @group: The multicast group to clear. - */ -void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) -{ - netlink_table_grab(); - __netlink_clear_multicast_users(ksk, group); - netlink_table_ungrab(); -} - -void netlink_set_nonroot(int protocol, unsigned int flags) +struct nlmsghdr * +__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { - if ((unsigned int)protocol < MAX_LINKS) - nl_table[protocol].nl_nonroot = flags; -} -EXPORT_SYMBOL(netlink_set_nonroot); + struct nlmsghdr *nlh; + int size = nlmsg_msg_size(len); -static void netlink_destroy_callback(struct netlink_callback *cb) -{ - kfree_skb(cb->skb); - kfree(cb); + nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size)); + nlh->nlmsg_type = type; + nlh->nlmsg_len = size; + nlh->nlmsg_flags = flags; + nlh->nlmsg_pid = portid; + nlh->nlmsg_seq = seq; + if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) + memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); + return nlh; } +EXPORT_SYMBOL(__nlmsg_put); /* * It looks a bit ugly. @@ -1669,22 +2675,48 @@ static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_callback *cb; - struct sk_buff *skb; + struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int len, err = -ENOBUFS; - - skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); - if (!skb) - goto errout; + int alloc_size; mutex_lock(nlk->cb_mutex); - - cb = nlk->cb; - if (cb == NULL) { + if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } + cb = &nlk->cb; + alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); + + if (!netlink_rx_is_mmaped(sk) && + atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + goto errout_skb; + + /* NLMSG_GOODSIZE is small to avoid high order allocations being + * required, but it makes sense to _attempt_ a 16K bytes allocation + * to reduce number of system calls on dump operations, if user + * ever provided a big enough buffer. + */ + if (alloc_size < nlk->max_recvmsg_len) { + skb = netlink_alloc_skb(sk, + nlk->max_recvmsg_len, + nlk->portid, + GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + /* available room should be exact amount to avoid MSG_TRUNC */ + if (skb) + skb_reserve(skb, skb_tailroom(skb) - + nlk->max_recvmsg_len); + } + if (!skb) + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, + GFP_KERNEL); + if (!skb) + goto errout_skb; + netlink_skb_set_owner_r(skb, sk); + len = cb->dump(skb, cb); if (len > 0) { @@ -1692,10 +2724,8 @@ static int netlink_dump(struct sock *sk) if (sk_filter(sk, skb)) kfree_skb(skb); - else { - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - } + else + __netlink_sendskb(sk, skb); return 0; } @@ -1703,76 +2733,102 @@ static int netlink_dump(struct sock *sk) if (!nlh) goto errout_skb; + nl_dump_check_consistent(cb, nlh); + memcpy(nlmsg_data(nlh), &len, sizeof(len)); if (sk_filter(sk, skb)) kfree_skb(skb); - else { - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - } + else + __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); - nlk->cb = NULL; - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); + nlk->cb_running = false; + mutex_unlock(nlk->cb_mutex); + module_put(cb->module); + consume_skb(cb->skb); return 0; errout_skb: mutex_unlock(nlk->cb_mutex); kfree_skb(skb); -errout: return err; } -int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - int (*dump)(struct sk_buff *skb, - struct netlink_callback *), - int (*done)(struct netlink_callback *)) +int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) { struct netlink_callback *cb; struct sock *sk; struct netlink_sock *nlk; + int ret; - cb = kzalloc(sizeof(*cb), GFP_KERNEL); - if (cb == NULL) - return -ENOBUFS; - - cb->dump = dump; - cb->done = done; - cb->nlh = nlh; - atomic_inc(&skb->users); - cb->skb = skb; + /* Memory mapped dump requests need to be copied to avoid looping + * on the pending state in netlink_mmap_sendmsg() while the CB hold + * a reference to the skb. + */ + if (netlink_skb_is_mmaped(skb)) { + skb = skb_copy(skb, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; + } else + atomic_inc(&skb->users); - sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).pid); + sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { - netlink_destroy_callback(cb); - return -ECONNREFUSED; + ret = -ECONNREFUSED; + goto error_free; } + nlk = nlk_sk(sk); - /* A dump is in progress... */ mutex_lock(nlk->cb_mutex); - if (nlk->cb) { - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); - sock_put(sk); - return -EBUSY; + /* A dump is in progress... */ + if (nlk->cb_running) { + ret = -EBUSY; + goto error_unlock; } - nlk->cb = cb; + /* add reference of module which cb->dump belongs to */ + if (!try_module_get(control->module)) { + ret = -EPROTONOSUPPORT; + goto error_unlock; + } + + cb = &nlk->cb; + memset(cb, 0, sizeof(*cb)); + cb->dump = control->dump; + cb->done = control->done; + cb->nlh = nlh; + cb->data = control->data; + cb->module = control->module; + cb->min_dump_alloc = control->min_dump_alloc; + cb->skb = skb; + + nlk->cb_running = true; + mutex_unlock(nlk->cb_mutex); - netlink_dump(sk); + ret = netlink_dump(sk); sock_put(sk); + if (ret) + return ret; + /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; + +error_unlock: + sock_put(sk); + mutex_unlock(nlk->cb_mutex); +error_free: + kfree_skb(skb); + return ret; } -EXPORT_SYMBOL(netlink_dump_start); +EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) { @@ -1785,13 +2841,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (err) payload += nlmsg_len(nlh); - skb = nlmsg_new(payload, GFP_KERNEL); + skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), + NETLINK_CB(in_skb).portid, GFP_KERNEL); if (!skb) { struct sock *sk; sk = netlink_lookup(sock_net(in_skb->sk), in_skb->sk->sk_protocol, - NETLINK_CB(in_skb).pid); + NETLINK_CB(in_skb).portid); if (sk) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); @@ -1800,12 +2857,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) return; } - rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); - netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); @@ -1855,33 +2912,33 @@ EXPORT_SYMBOL(netlink_rcv_skb); * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message - * @pid: destination netlink pid for reports or 0 + * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ -int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, +int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { - int exclude_pid = 0; + int exclude_portid = 0; if (report) { atomic_inc(&skb->users); - exclude_pid = pid; + exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ - err = nlmsg_multicast(sk, skb, exclude_pid, group, flags); + err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); } if (report) { int err2; - err2 = nlmsg_unicast(sk, skb, pid); + err2 = nlmsg_unicast(sk, skb, portid); if (!err || err == -ESRCH) err = err2; } @@ -1902,14 +2959,13 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) struct nl_seq_iter *iter = seq->private; int i, j; struct sock *s; - struct hlist_node *node; loff_t off = 0; for (i = 0; i < MAX_LINKS; i++) { - struct nl_pid_hash *hash = &nl_table[i].hash; + struct nl_portid_hash *hash = &nl_table[i].hash; for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, node, &hash->table[j]) { + sk_for_each(s, &hash->table[j]) { if (sock_net(s) != seq_file_net(seq)) continue; if (off == pos) { @@ -1935,6 +2991,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *s; struct nl_seq_iter *iter; + struct net *net; int i, j; ++*pos; @@ -1942,11 +2999,12 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return netlink_seq_socket_idx(seq, 0); + net = seq_file_net(seq); iter = seq->private; s = v; do { s = sk_next(s); - } while (s && sock_net(s) != seq_file_net(seq)); + } while (s && !nl_table[s->sk_protocol].compare(net, s)); if (s) return s; @@ -1954,11 +3012,12 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) j = iter->hash_idx + 1; do { - struct nl_pid_hash *hash = &nl_table[i].hash; + struct nl_portid_hash *hash = &nl_table[i].hash; for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); - while (s && sock_net(s) != seq_file_net(seq)) + + while (s && !nl_table[s->sk_protocol].compare(net, s)) s = sk_next(s); if (s) { iter->link = i; @@ -1982,22 +3041,22 @@ static void netlink_seq_stop(struct seq_file *seq, void *v) static int netlink_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); - else { + } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %-8d %-8d %-8lu\n", + seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n", s, s->sk_protocol, - nlk->pid, + nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - nlk->cb, + nlk->cb_running, atomic_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) @@ -2052,7 +3111,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll = netlink_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -2060,7 +3119,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = sock_no_mmap, + .mmap = netlink_mmap, .sendpage = sock_no_sendpage, }; @@ -2073,7 +3132,7 @@ static const struct net_proto_family netlink_family_ops = { static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS - if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops)) return -ENOMEM; #endif return 0; @@ -2082,7 +3141,7 @@ static int __net_init netlink_net_init(struct net *net) static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "netlink"); + remove_proc_entry("netlink", net->proc_net); #endif } @@ -2101,6 +3160,7 @@ static void __init netlink_add_usersock_entry(void) rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; + nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } @@ -2112,7 +3172,6 @@ static struct pernet_operations __net_initdata netlink_net_ops = { static int __init netlink_proto_init(void) { - struct sk_buff *dummy_skb; int i; unsigned long limit; unsigned int order; @@ -2121,7 +3180,7 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; - BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) @@ -2137,12 +3196,12 @@ static int __init netlink_proto_init(void) order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; for (i = 0; i < MAX_LINKS; i++) { - struct nl_pid_hash *hash = &nl_table[i].hash; + struct nl_portid_hash *hash = &nl_table[i].hash; - hash->table = nl_pid_hash_zalloc(1 * sizeof(*hash->table)); + hash->table = nl_portid_hash_zalloc(1 * sizeof(*hash->table)); if (!hash->table) { while (i-- > 0) - nl_pid_hash_free(nl_table[i].hash.table, + nl_portid_hash_free(nl_table[i].hash.table, 1 * sizeof(*hash->table)); kfree(nl_table); goto panic; @@ -2151,8 +3210,12 @@ static int __init netlink_proto_init(void) hash->shift = 0; hash->mask = 0; hash->rehash_time = jiffies; + + nl_table[i].compare = netlink_compare; } + INIT_LIST_HEAD(&netlink_tap_all); + netlink_add_usersock_entry(); sock_register(&netlink_family_ops); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h new file mode 100644 index 00000000000..0b59d441f5b --- /dev/null +++ b/net/netlink/af_netlink.h @@ -0,0 +1,87 @@ +#ifndef _AF_NETLINK_H +#define _AF_NETLINK_H + +#include <net/sock.h> + +#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) +#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) + +struct netlink_ring { + void **pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + + unsigned int pg_vec_order; + unsigned int pg_vec_pages; + unsigned int pg_vec_len; + + atomic_t pending; +}; + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 portid; + u32 dst_portid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + size_t max_recvmsg_len; + wait_queue_head_t wait; + bool cb_running; + struct netlink_callback cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + int (*netlink_bind)(int group); + void (*netlink_unbind)(int group); + struct module *module; +#ifdef CONFIG_NETLINK_MMAP + struct mutex pg_vec_lock; + struct netlink_ring rx_ring; + struct netlink_ring tx_ring; + atomic_t mapped; +#endif /* CONFIG_NETLINK_MMAP */ +}; + +static inline struct netlink_sock *nlk_sk(struct sock *sk) +{ + return container_of(sk, struct netlink_sock, sk); +} + +struct nl_portid_hash { + struct hlist_head *table; + unsigned long rehash_time; + + unsigned int mask; + unsigned int shift; + + unsigned int entries; + unsigned int max_shift; + + u32 rnd; +}; + +struct netlink_table { + struct nl_portid_hash hash; + struct hlist_head mc_list; + struct listeners __rcu *listeners; + unsigned int flags; + unsigned int groups; + struct mutex *cb_mutex; + struct module *module; + int (*bind)(int group); + void (*unbind)(int group); + bool (*compare)(struct net *net, struct sock *sock); + int registered; +}; + +extern struct netlink_table *nl_table; +extern rwlock_t nl_table_lock; + +#endif diff --git a/net/netlink/diag.c b/net/netlink/diag.c new file mode 100644 index 00000000000..1af29624b92 --- /dev/null +++ b/net/netlink/diag.c @@ -0,0 +1,227 @@ +#include <linux/module.h> + +#include <net/sock.h> +#include <linux/netlink.h> +#include <linux/sock_diag.h> +#include <linux/netlink_diag.h> + +#include "af_netlink.h" + +#ifdef CONFIG_NETLINK_MMAP +static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, + struct sk_buff *nlskb) +{ + struct netlink_diag_ring ndr; + + ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; + ndr.ndr_block_nr = ring->pg_vec_len; + ndr.ndr_frame_size = ring->frame_size; + ndr.ndr_frame_nr = ring->frame_max + 1; + + return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); +} + +static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int ret; + + mutex_lock(&nlk->pg_vec_lock); + ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); + if (!ret) + ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, + nlskb); + mutex_unlock(&nlk->pg_vec_lock); + + return ret; +} +#else +static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) +{ + return 0; +} +#endif + +static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->groups == NULL) + return 0; + + return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups), + nlk->groups); +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_diag_req *req, + u32 portid, u32 seq, u32 flags, int sk_ino) +{ + struct nlmsghdr *nlh; + struct netlink_diag_msg *rep; + struct netlink_sock *nlk = nlk_sk(sk); + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->ndiag_family = AF_NETLINK; + rep->ndiag_type = sk->sk_type; + rep->ndiag_protocol = sk->sk_protocol; + rep->ndiag_state = sk->sk_state; + + rep->ndiag_ino = sk_ino; + rep->ndiag_portid = nlk->portid; + rep->ndiag_dst_portid = nlk->dst_portid; + rep->ndiag_dst_group = nlk->dst_group; + sock_diag_save_cookie(sk, rep->ndiag_cookie); + + if ((req->ndiag_show & NDIAG_SHOW_GROUPS) && + sk_diag_dump_groups(sk, skb)) + goto out_nlmsg_trim; + + if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && + sk_diag_put_rings_cfg(sk, skb)) + goto out_nlmsg_trim; + + return nlmsg_end(skb, nlh); + +out_nlmsg_trim: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + int protocol, int s_num) +{ + struct netlink_table *tbl = &nl_table[protocol]; + struct nl_portid_hash *hash = &tbl->hash; + struct net *net = sock_net(skb->sk); + struct netlink_diag_req *req; + struct sock *sk; + int ret = 0, num = 0, i; + + req = nlmsg_data(cb->nlh); + + for (i = 0; i <= hash->mask; i++) { + sk_for_each(sk, &hash->table[i]) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { + num++; + continue; + } + + if (sk_diag_fill(sk, skb, req, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + ret = 1; + goto done; + } + + num++; + } + } + + sk_for_each_bound(sk, &tbl->mc_list) { + if (sk_hashed(sk)) + continue; + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { + num++; + continue; + } + + if (sk_diag_fill(sk, skb, req, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + ret = 1; + goto done; + } + num++; + } +done: + cb->args[0] = num; + cb->args[1] = protocol; + + return ret; +} + +static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netlink_diag_req *req; + int s_num = cb->args[0]; + + req = nlmsg_data(cb->nlh); + + read_lock(&nl_table_lock); + + if (req->sdiag_protocol == NDIAG_PROTO_ALL) { + int i; + + for (i = cb->args[1]; i < MAX_LINKS; i++) { + if (__netlink_diag_dump(skb, cb, i, s_num)) + break; + s_num = 0; + } + } else { + if (req->sdiag_protocol >= MAX_LINKS) { + read_unlock(&nl_table_lock); + return -ENOENT; + } + + __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); + } + + read_unlock(&nl_table_lock); + + return skb->len; +} + +static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct netlink_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = netlink_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } else + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler netlink_diag_handler = { + .family = AF_NETLINK, + .dump = netlink_diag_handler_dump, +}; + +static int __init netlink_diag_init(void) +{ + return sock_diag_register(&netlink_diag_handler); +} + +static void __exit netlink_diag_exit(void) +{ + sock_diag_unregister(&netlink_diag_handler); +} + +module_init(netlink_diag_init); +module_exit(netlink_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 1781d99145e..76393f2f4b2 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -16,10 +16,12 @@ #include <linux/skbuff.h> #include <linux/mutex.h> #include <linux/bitmap.h> +#include <linux/rwsem.h> #include <net/sock.h> #include <net/genetlink.h> static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ +static DECLARE_RWSEM(cb_lock); void genl_lock(void) { @@ -33,6 +35,26 @@ void genl_unlock(void) } EXPORT_SYMBOL(genl_unlock); +#ifdef CONFIG_LOCKDEP +int lockdep_genl_is_held(void) +{ + return lockdep_is_held(&genl_mutex); +} +EXPORT_SYMBOL(lockdep_genl_is_held); +#endif + +static void genl_lock_all(void) +{ + down_write(&cb_lock); + genl_lock(); +} + +static void genl_unlock_all(void) +{ + genl_unlock(); + up_write(&cb_lock); +} + #define GENL_FAM_TAB_SIZE 16 #define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) @@ -43,12 +65,27 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE]; * To avoid an allocation at boot of just one unsigned long, * declare it global instead. * Bit 0 is marked as already used since group 0 is invalid. + * Bit 1 is marked as already used since the drop-monitor code + * abuses the API and thinks it can statically use group 1. + * That group will typically conflict with other groups that + * any proper users use. + * Bit 16 is marked as used since it's used for generic netlink + * and the code no longer marks pre-reserved IDs as used. + * Bit 17 is marked as already used since the VFS quota code + * also abused this API and relied on family == group ID, we + * cater to that by giving it a static family and group ID. + * Bit 18 is marked as already used since the PMCRAID driver + * did the same thing as the VFS quota code (maybe copied?) */ -static unsigned long mc_group_start = 0x1; +static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | + BIT(GENL_ID_VFS_DQUOT) | + BIT(GENL_ID_PMCRAID); static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; -static int genl_ctrl_event(int event, void *data); +static int genl_ctrl_event(int event, struct genl_family *family, + const struct genl_multicast_group *grp, + int grp_id); static inline unsigned int genl_family_hash(unsigned int id) { @@ -84,13 +121,13 @@ static struct genl_family *genl_family_find_byname(char *name) return NULL; } -static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) +static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) { - struct genl_ops *ops; + int i; - list_for_each_entry(ops, &family->ops_list, ops_list) - if (ops->cmd == cmd) - return ops; + for (i = 0; i < family->n_ops; i++) + if (family->ops[i].cmd == cmd) + return &family->ops[i]; return NULL; } @@ -98,13 +135,15 @@ static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) /* Of course we are going to have problems once we hit * 2^16 alive types, but that can only happen by year 2K */ -static inline u16 genl_generate_id(void) +static u16 genl_generate_id(void) { static u16 id_gen_idx = GENL_MIN_ID; int i; for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) { - if (!genl_family_find_byid(id_gen_idx)) + if (id_gen_idx != GENL_ID_VFS_DQUOT && + id_gen_idx != GENL_ID_PMCRAID && + !genl_family_find_byid(id_gen_idx)) return id_gen_idx; if (++id_gen_idx > GENL_MAX_ID) id_gen_idx = GENL_MIN_ID; @@ -113,61 +152,113 @@ static inline u16 genl_generate_id(void) return 0; } -static struct genl_multicast_group notify_grp; - -/** - * genl_register_mc_group - register a multicast group - * - * Registers the specified multicast group and notifies userspace - * about the new group. - * - * Returns 0 on success or a negative error code. - * - * @family: The generic netlink family the group shall be registered for. - * @grp: The group to register, must have a name. - */ -int genl_register_mc_group(struct genl_family *family, - struct genl_multicast_group *grp) +static int genl_allocate_reserve_groups(int n_groups, int *first_id) { - int id; unsigned long *new_groups; - int err = 0; + int start = 0; + int i; + int id; + bool fits; + + do { + if (start == 0) + id = find_first_zero_bit(mc_groups, + mc_groups_longs * + BITS_PER_LONG); + else + id = find_next_zero_bit(mc_groups, + mc_groups_longs * BITS_PER_LONG, + start); + + fits = true; + for (i = id; + i < min_t(int, id + n_groups, + mc_groups_longs * BITS_PER_LONG); + i++) { + if (test_bit(i, mc_groups)) { + start = i; + fits = false; + break; + } + } - BUG_ON(grp->name[0] == '\0'); + if (id >= mc_groups_longs * BITS_PER_LONG) { + unsigned long new_longs = mc_groups_longs + + BITS_TO_LONGS(n_groups); + size_t nlen = new_longs * sizeof(unsigned long); + + if (mc_groups == &mc_group_start) { + new_groups = kzalloc(nlen, GFP_KERNEL); + if (!new_groups) + return -ENOMEM; + mc_groups = new_groups; + *mc_groups = mc_group_start; + } else { + new_groups = krealloc(mc_groups, nlen, + GFP_KERNEL); + if (!new_groups) + return -ENOMEM; + mc_groups = new_groups; + for (i = 0; i < BITS_TO_LONGS(n_groups); i++) + mc_groups[mc_groups_longs + i] = 0; + } + mc_groups_longs = new_longs; + } + } while (!fits); - genl_lock(); + for (i = id; i < id + n_groups; i++) + set_bit(i, mc_groups); + *first_id = id; + return 0; +} + +static struct genl_family genl_ctrl; - /* special-case our own group */ - if (grp == ¬ify_grp) - id = GENL_ID_CTRL; - else - id = find_first_zero_bit(mc_groups, - mc_groups_longs * BITS_PER_LONG); +static int genl_validate_assign_mc_groups(struct genl_family *family) +{ + int first_id; + int n_groups = family->n_mcgrps; + int err = 0, i; + bool groups_allocated = false; + if (!n_groups) + return 0; - if (id >= mc_groups_longs * BITS_PER_LONG) { - size_t nlen = (mc_groups_longs + 1) * sizeof(unsigned long); + for (i = 0; i < n_groups; i++) { + const struct genl_multicast_group *grp = &family->mcgrps[i]; - if (mc_groups == &mc_group_start) { - new_groups = kzalloc(nlen, GFP_KERNEL); - if (!new_groups) { - err = -ENOMEM; - goto out; - } - mc_groups = new_groups; - *mc_groups = mc_group_start; - } else { - new_groups = krealloc(mc_groups, nlen, GFP_KERNEL); - if (!new_groups) { - err = -ENOMEM; - goto out; - } - mc_groups = new_groups; - mc_groups[mc_groups_longs] = 0; - } - mc_groups_longs++; + if (WARN_ON(grp->name[0] == '\0')) + return -EINVAL; + if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL)) + return -EINVAL; } + /* special-case our own group and hacks */ + if (family == &genl_ctrl) { + first_id = GENL_ID_CTRL; + BUG_ON(n_groups != 1); + } else if (strcmp(family->name, "NET_DM") == 0) { + first_id = 1; + BUG_ON(n_groups != 1); + } else if (family->id == GENL_ID_VFS_DQUOT) { + first_id = GENL_ID_VFS_DQUOT; + BUG_ON(n_groups != 1); + } else if (family->id == GENL_ID_PMCRAID) { + first_id = GENL_ID_PMCRAID; + BUG_ON(n_groups != 1); + } else { + groups_allocated = true; + err = genl_allocate_reserve_groups(n_groups, &first_id); + if (err) + return err; + } + + family->mcgrp_offset = first_id; + + /* if still initializing, can't and don't need to to realloc bitmaps */ + if (!init_net.genl_sock) + return 0; + if (family->netnsok) { struct net *net; @@ -183,9 +274,7 @@ int genl_register_mc_group(struct genl_family *family, * number of _possible_ groups has been * increased on some sockets which is ok. */ - rcu_read_unlock(); - netlink_table_ungrab(); - goto out; + break; } } rcu_read_unlock(); @@ -193,155 +282,66 @@ int genl_register_mc_group(struct genl_family *family, } else { err = netlink_change_ngroups(init_net.genl_sock, mc_groups_longs * BITS_PER_LONG); - if (err) - goto out; } - grp->id = id; - set_bit(id, mc_groups); - list_add_tail(&grp->list, &family->mcast_groups); - grp->family = family; + if (groups_allocated && err) { + for (i = 0; i < family->n_mcgrps; i++) + clear_bit(family->mcgrp_offset + i, mc_groups); + } - genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, grp); - out: - genl_unlock(); return err; } -EXPORT_SYMBOL(genl_register_mc_group); -static void __genl_unregister_mc_group(struct genl_family *family, - struct genl_multicast_group *grp) +static void genl_unregister_mc_groups(struct genl_family *family) { struct net *net; - BUG_ON(grp->family != family); + int i; netlink_table_grab(); rcu_read_lock(); - for_each_net_rcu(net) - __netlink_clear_multicast_users(net->genl_sock, grp->id); + for_each_net_rcu(net) { + for (i = 0; i < family->n_mcgrps; i++) + __netlink_clear_multicast_users( + net->genl_sock, family->mcgrp_offset + i); + } rcu_read_unlock(); netlink_table_ungrab(); - clear_bit(grp->id, mc_groups); - list_del(&grp->list); - genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp); - grp->id = 0; - grp->family = NULL; -} + for (i = 0; i < family->n_mcgrps; i++) { + int grp_id = family->mcgrp_offset + i; -/** - * genl_unregister_mc_group - unregister a multicast group - * - * Unregisters the specified multicast group and notifies userspace - * about it. All current listeners on the group are removed. - * - * Note: It is not necessary to unregister all multicast groups before - * unregistering the family, unregistering the family will cause - * all assigned multicast groups to be unregistered automatically. - * - * @family: Generic netlink family the group belongs to. - * @grp: The group to unregister, must have been registered successfully - * previously. - */ -void genl_unregister_mc_group(struct genl_family *family, - struct genl_multicast_group *grp) -{ - genl_lock(); - __genl_unregister_mc_group(family, grp); - genl_unlock(); -} -EXPORT_SYMBOL(genl_unregister_mc_group); - -static void genl_unregister_mc_groups(struct genl_family *family) -{ - struct genl_multicast_group *grp, *tmp; - - list_for_each_entry_safe(grp, tmp, &family->mcast_groups, list) - __genl_unregister_mc_group(family, grp); + if (grp_id != 1) + clear_bit(grp_id, mc_groups); + genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family, + &family->mcgrps[i], grp_id); + } } -/** - * genl_register_ops - register generic netlink operations - * @family: generic netlink family - * @ops: operations to be registered - * - * Registers the specified operations and assigns them to the specified - * family. Either a doit or dumpit callback must be specified or the - * operation will fail. Only one operation structure per command - * identifier may be registered. - * - * See include/net/genetlink.h for more documenation on the operations - * structure. - * - * Returns 0 on success or a negative error code. - */ -int genl_register_ops(struct genl_family *family, struct genl_ops *ops) +static int genl_validate_ops(const struct genl_family *family) { - int err = -EINVAL; - - if (ops->dumpit == NULL && ops->doit == NULL) - goto errout; + const struct genl_ops *ops = family->ops; + unsigned int n_ops = family->n_ops; + int i, j; - if (genl_get_cmd(ops->cmd, family)) { - err = -EEXIST; - goto errout; - } - - if (ops->dumpit) - ops->flags |= GENL_CMD_CAP_DUMP; - if (ops->doit) - ops->flags |= GENL_CMD_CAP_DO; - if (ops->policy) - ops->flags |= GENL_CMD_CAP_HASPOL; - - genl_lock(); - list_add_tail(&ops->ops_list, &family->ops_list); - genl_unlock(); - - genl_ctrl_event(CTRL_CMD_NEWOPS, ops); - err = 0; -errout: - return err; -} -EXPORT_SYMBOL(genl_register_ops); + if (WARN_ON(n_ops && !ops)) + return -EINVAL; -/** - * genl_unregister_ops - unregister generic netlink operations - * @family: generic netlink family - * @ops: operations to be unregistered - * - * Unregisters the specified operations and unassigns them from the - * specified family. The operation blocks until the current message - * processing has finished and doesn't start again until the - * unregister process has finished. - * - * Note: It is not necessary to unregister all operations before - * unregistering the family, unregistering the family will cause - * all assigned operations to be unregistered automatically. - * - * Returns 0 on success or a negative error code. - */ -int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) -{ - struct genl_ops *rc; + if (!n_ops) + return 0; - genl_lock(); - list_for_each_entry(rc, &family->ops_list, ops_list) { - if (rc == ops) { - list_del(&ops->ops_list); - genl_unlock(); - genl_ctrl_event(CTRL_CMD_DELOPS, ops); - return 0; - } + for (i = 0; i < n_ops; i++) { + if (ops[i].dumpit == NULL && ops[i].doit == NULL) + return -EINVAL; + for (j = i + 1; j < n_ops; j++) + if (ops[i].cmd == ops[j].cmd) + return -EINVAL; } - genl_unlock(); - return -ENOENT; + return 0; } -EXPORT_SYMBOL(genl_unregister_ops); /** - * genl_register_family - register a generic netlink family + * __genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one @@ -349,11 +349,14 @@ EXPORT_SYMBOL(genl_unregister_ops); * The family id may equal GENL_ID_GENERATE causing an unique id to * be automatically generated and assigned. * + * The family's ops array must already be assigned, you can use the + * genl_register_family_with_ops() helper function. + * * Return 0 on success or a negative error code. */ -int genl_register_family(struct genl_family *family) +int __genl_register_family(struct genl_family *family) { - int err = -EINVAL; + int err = -EINVAL, i; if (family->id && family->id < GENL_MIN_ID) goto errout; @@ -361,10 +364,11 @@ int genl_register_family(struct genl_family *family) if (family->id > GENL_MAX_ID) goto errout; - INIT_LIST_HEAD(&family->ops_list); - INIT_LIST_HEAD(&family->mcast_groups); + err = genl_validate_ops(family); + if (err) + return err; - genl_lock(); + genl_lock_all(); if (genl_family_find_byname(family->name)) { err = -EEXIST; @@ -385,7 +389,7 @@ int genl_register_family(struct genl_family *family) goto errout_locked; } - if (family->maxattr) { + if (family->maxattr && !family->parallel_ops) { family->attrbuf = kmalloc((family->maxattr+1) * sizeof(struct nlattr *), GFP_KERNEL); if (family->attrbuf == NULL) { @@ -395,65 +399,27 @@ int genl_register_family(struct genl_family *family) } else family->attrbuf = NULL; + err = genl_validate_assign_mc_groups(family); + if (err) + goto errout_locked; + list_add_tail(&family->family_list, genl_family_chain(family->id)); - genl_unlock(); + genl_unlock_all(); - genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); + /* send all events */ + genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); + for (i = 0; i < family->n_mcgrps; i++) + genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family, + &family->mcgrps[i], family->mcgrp_offset + i); return 0; errout_locked: - genl_unlock(); + genl_unlock_all(); errout: return err; } -EXPORT_SYMBOL(genl_register_family); - -/** - * genl_register_family_with_ops - register a generic netlink family - * @family: generic netlink family - * @ops: operations to be registered - * @n_ops: number of elements to register - * - * Registers the specified family and operations from the specified table. - * Only one family may be registered with the same family name or identifier. - * - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. - * - * Either a doit or dumpit callback must be specified for every registered - * operation or the function will fail. Only one operation structure per - * command identifier may be registered. - * - * See include/net/genetlink.h for more documenation on the operations - * structure. - * - * This is equivalent to calling genl_register_family() followed by - * genl_register_ops() for every operation entry in the table taking - * care to unregister the family on error path. - * - * Return 0 on success or a negative error code. - */ -int genl_register_family_with_ops(struct genl_family *family, - struct genl_ops *ops, size_t n_ops) -{ - int err, i; - - err = genl_register_family(family); - if (err) - return err; - - for (i = 0; i < n_ops; ++i, ++ops) { - err = genl_register_ops(family, ops); - if (err) - goto err_out; - } - return 0; -err_out: - genl_unregister_family(family); - return err; -} -EXPORT_SYMBOL(genl_register_family_with_ops); +EXPORT_SYMBOL(__genl_register_family); /** * genl_unregister_family - unregister generic netlink family @@ -467,7 +433,7 @@ int genl_unregister_family(struct genl_family *family) { struct genl_family *rc; - genl_lock(); + genl_lock_all(); genl_unregister_mc_groups(family); @@ -476,33 +442,108 @@ int genl_unregister_family(struct genl_family *family) continue; list_del(&rc->family_list); - INIT_LIST_HEAD(&family->ops_list); - genl_unlock(); + family->n_ops = 0; + genl_unlock_all(); kfree(family->attrbuf); - genl_ctrl_event(CTRL_CMD_DELFAMILY, family); + genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; } - genl_unlock(); + genl_unlock_all(); return -ENOENT; } EXPORT_SYMBOL(genl_unregister_family); -static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +/** + * genlmsg_new_unicast - Allocate generic netlink message for unicast + * @payload: size of the message payload + * @info: information on destination + * @flags: the type of memory to allocate + * + * Allocates a new sk_buff large enough to cover the specified payload + * plus required Netlink headers. Will check receiving socket for + * memory mapped i/o capability and use it if enabled. Will fall back + * to non-mapped skb if message size exceeds the frame size of the ring. + */ +struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, + gfp_t flags) { - struct genl_ops *ops; - struct genl_family *family; + size_t len = nlmsg_total_size(genlmsg_total_size(payload)); + + return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); +} +EXPORT_SYMBOL_GPL(genlmsg_new_unicast); + +/** + * genlmsg_put - Add generic netlink header to netlink message + * @skb: socket buffer holding the message + * @portid: netlink portid the message is addressed to + * @seq: sequence number (usually the one of the sender) + * @family: generic netlink family + * @flags: netlink message flags + * @cmd: generic netlink command + * + * Returns pointer to user specific header + */ +void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, + struct genl_family *family, int flags, u8 cmd) +{ + struct nlmsghdr *nlh; + struct genlmsghdr *hdr; + + nlh = nlmsg_put(skb, portid, seq, family->id, GENL_HDRLEN + + family->hdrsize, flags); + if (nlh == NULL) + return NULL; + + hdr = nlmsg_data(nlh); + hdr->cmd = cmd; + hdr->version = family->version; + hdr->reserved = 0; + + return (char *) hdr + GENL_HDRLEN; +} +EXPORT_SYMBOL(genlmsg_put); + +static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + /* our ops are always const - netlink API doesn't propagate that */ + const struct genl_ops *ops = cb->data; + int rc; + + genl_lock(); + rc = ops->dumpit(skb, cb); + genl_unlock(); + return rc; +} + +static int genl_lock_done(struct netlink_callback *cb) +{ + /* our ops are always const - netlink API doesn't propagate that */ + const struct genl_ops *ops = cb->data; + int rc = 0; + + if (ops->done) { + genl_lock(); + rc = ops->done(cb); + genl_unlock(); + } + return rc; +} + +static int genl_family_rcv_msg(struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh) +{ + const struct genl_ops *ops; struct net *net = sock_net(skb->sk); struct genl_info info; struct genlmsghdr *hdr = nlmsg_data(nlh); + struct nlattr **attrbuf; int hdrlen, err; - family = genl_family_find_byid(nlh->nlmsg_type); - if (family == NULL) - return -ENOENT; - /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) return -ENOENT; @@ -516,43 +557,73 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - security_netlink_recv(skb, CAP_NET_ADMIN)) + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { + int rc; + if (ops->dumpit == NULL) return -EOPNOTSUPP; - genl_unlock(); - err = netlink_dump_start(net->genl_sock, skb, nlh, - ops->dumpit, ops->done); - genl_lock(); - return err; + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .module = family->module, + /* we have const, but the netlink API doesn't */ + .data = (void *)ops, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; + + genl_unlock(); + rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); + + } else { + struct netlink_dump_control c = { + .module = family->module, + .dump = ops->dumpit, + .done = ops->done, + }; + + rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + } + + return rc; } if (ops->doit == NULL) return -EOPNOTSUPP; - if (family->attrbuf) { - err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, + if (family->maxattr && family->parallel_ops) { + attrbuf = kmalloc((family->maxattr+1) * + sizeof(struct nlattr *), GFP_KERNEL); + if (attrbuf == NULL) + return -ENOMEM; + } else + attrbuf = family->attrbuf; + + if (attrbuf) { + err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, ops->policy); if (err < 0) - return err; + goto out; } info.snd_seq = nlh->nlmsg_seq; - info.snd_pid = NETLINK_CB(skb).pid; + info.snd_portid = NETLINK_CB(skb).portid; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; - info.attrs = family->attrbuf; + info.attrs = attrbuf; + info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); if (family->pre_doit) { err = family->pre_doit(ops, skb, &info); if (err) - return err; + goto out; } err = ops->doit(skb, &info); @@ -560,14 +631,38 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (family->post_doit) family->post_doit(ops, skb, &info); +out: + if (family->parallel_ops) + kfree(attrbuf); + + return err; +} + +static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct genl_family *family; + int err; + + family = genl_family_find_byid(nlh->nlmsg_type); + if (family == NULL) + return -ENOENT; + + if (!family->parallel_ops) + genl_lock(); + + err = genl_family_rcv_msg(family, skb, nlh); + + if (!family->parallel_ops) + genl_unlock(); + return err; } static void genl_rcv(struct sk_buff *skb) { - genl_lock(); + down_read(&cb_lock); netlink_rcv_skb(skb, &genl_rcv_msg); - genl_unlock(); + up_read(&cb_lock); } /************************************************************************** @@ -582,39 +677,49 @@ static struct genl_family genl_ctrl = { .netnsok = true, }; -static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, +static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; - hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd); + hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; - NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name); - NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id); - NLA_PUT_U32(skb, CTRL_ATTR_VERSION, family->version); - NLA_PUT_U32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize); - NLA_PUT_U32(skb, CTRL_ATTR_MAXATTR, family->maxattr); + if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || + nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) || + nla_put_u32(skb, CTRL_ATTR_VERSION, family->version) || + nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize) || + nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr)) + goto nla_put_failure; - if (!list_empty(&family->ops_list)) { + if (family->n_ops) { struct nlattr *nla_ops; - struct genl_ops *ops; - int idx = 1; + int i; nla_ops = nla_nest_start(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; - list_for_each_entry(ops, &family->ops_list, ops_list) { + for (i = 0; i < family->n_ops; i++) { struct nlattr *nest; + const struct genl_ops *ops = &family->ops[i]; + u32 op_flags = ops->flags; + + if (ops->dumpit) + op_flags |= GENL_CMD_CAP_DUMP; + if (ops->doit) + op_flags |= GENL_CMD_CAP_DO; + if (ops->policy) + op_flags |= GENL_CMD_CAP_HASPOL; - nest = nla_nest_start(skb, idx++); + nest = nla_nest_start(skb, i + 1); if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, CTRL_ATTR_OP_ID, ops->cmd); - NLA_PUT_U32(skb, CTRL_ATTR_OP_FLAGS, ops->flags); + if (nla_put_u32(skb, CTRL_ATTR_OP_ID, ops->cmd) || + nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags)) + goto nla_put_failure; nla_nest_end(skb, nest); } @@ -622,25 +727,29 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, nla_nest_end(skb, nla_ops); } - if (!list_empty(&family->mcast_groups)) { - struct genl_multicast_group *grp; + if (family->n_mcgrps) { struct nlattr *nla_grps; - int idx = 1; + int i; nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; - list_for_each_entry(grp, &family->mcast_groups, list) { + for (i = 0; i < family->n_mcgrps; i++) { struct nlattr *nest; + const struct genl_multicast_group *grp; + + grp = &family->mcgrps[i]; - nest = nla_nest_start(skb, idx++); + nest = nla_nest_start(skb, i + 1); if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id); - NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME, - grp->name); + if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, + family->mcgrp_offset + i) || + nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, + grp->name)) + goto nla_put_failure; nla_nest_end(skb, nest); } @@ -654,20 +763,22 @@ nla_put_failure: return -EMSGSIZE; } -static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid, - u32 seq, u32 flags, struct sk_buff *skb, - u8 cmd) +static int ctrl_fill_mcgrp_info(struct genl_family *family, + const struct genl_multicast_group *grp, + int grp_id, u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) { void *hdr; struct nlattr *nla_grps; struct nlattr *nest; - hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd); + hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; - NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, grp->family->name); - NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, grp->family->id); + if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || + nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id)) + goto nla_put_failure; nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) @@ -677,9 +788,10 @@ static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid, if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id); - NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME, - grp->name); + if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) || + nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, + grp->name)) + goto nla_put_failure; nla_nest_end(skb, nest); nla_nest_end(skb, nla_grps); @@ -707,7 +819,7 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) continue; if (++n < fams_to_skip) continue; - if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, CTRL_CMD_NEWFAMILY) < 0) goto errout; @@ -724,7 +836,7 @@ errout: } static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, - u32 pid, int seq, u8 cmd) + u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; @@ -733,7 +845,7 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, if (skb == NULL) return ERR_PTR(-ENOBUFS); - err = ctrl_fill_info(family, pid, seq, 0, skb, cmd); + err = ctrl_fill_info(family, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); @@ -742,8 +854,10 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, return skb; } -static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_multicast_group *grp, - u32 pid, int seq, u8 cmd) +static struct sk_buff * +ctrl_build_mcgrp_msg(struct genl_family *family, + const struct genl_multicast_group *grp, + int grp_id, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; @@ -752,7 +866,8 @@ static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_multicast_group *grp, if (skb == NULL) return ERR_PTR(-ENOBUFS); - err = ctrl_fill_mcgrp_info(grp, pid, seq, 0, skb, cmd); + err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid, + seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); @@ -784,6 +899,17 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); +#ifdef CONFIG_MODULES + if (res == NULL) { + genl_unlock(); + up_read(&cb_lock); + request_module("net-pf-%d-proto-%d-family-%s", + PF_NETLINK, NETLINK_GENERIC, name); + down_read(&cb_lock); + genl_lock(); + res = genl_family_find_byname(name); + } +#endif err = -ENOENT; } @@ -795,7 +921,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) return -ENOENT; } - msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq, + msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq, CTRL_CMD_NEWFAMILY); if (IS_ERR(msg)) return PTR_ERR(msg); @@ -803,11 +929,11 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } -static int genl_ctrl_event(int event, void *data) +static int genl_ctrl_event(int event, struct genl_family *family, + const struct genl_multicast_group *grp, + int grp_id) { struct sk_buff *msg; - struct genl_family *family; - struct genl_multicast_group *grp; /* genl is still initialising */ if (!init_net.genl_sock) @@ -816,14 +942,13 @@ static int genl_ctrl_event(int event, void *data) switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: - family = data; + WARN_ON(grp); msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: - grp = data; - family = grp->family; - msg = ctrl_build_mcgrp_msg(data, 0, 0, event); + BUG_ON(!grp); + msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event); break; default: return -EINVAL; @@ -833,34 +958,40 @@ static int genl_ctrl_event(int event, void *data) return PTR_ERR(msg); if (!family->netnsok) { - genlmsg_multicast_netns(&init_net, msg, 0, - GENL_ID_CTRL, GFP_KERNEL); + genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, + 0, GFP_KERNEL); } else { rcu_read_lock(); - genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC); + genlmsg_multicast_allns(&genl_ctrl, msg, 0, + 0, GFP_ATOMIC); rcu_read_unlock(); } return 0; } -static struct genl_ops genl_ctrl_ops = { - .cmd = CTRL_CMD_GETFAMILY, - .doit = ctrl_getfamily, - .dumpit = ctrl_dumpfamily, - .policy = ctrl_policy, +static struct genl_ops genl_ctrl_ops[] = { + { + .cmd = CTRL_CMD_GETFAMILY, + .doit = ctrl_getfamily, + .dumpit = ctrl_dumpfamily, + .policy = ctrl_policy, + }, }; -static struct genl_multicast_group notify_grp = { - .name = "notify", +static struct genl_multicast_group genl_ctrl_groups[] = { + { .name = "notify", }, }; static int __net_init genl_pernet_init(struct net *net) { + struct netlink_kernel_cfg cfg = { + .input = genl_rcv, + .flags = NL_CFG_F_NONROOT_RECV, + }; + /* we'll bump the group number right afterwards */ - net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, - genl_rcv, &genl_mutex, - THIS_MODULE); + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); @@ -889,20 +1020,15 @@ static int __init genl_init(void) for (i = 0; i < GENL_FAM_TAB_SIZE; i++) INIT_LIST_HEAD(&family_ht[i]); - err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1); + err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops, + genl_ctrl_groups); if (err < 0) goto problem; - netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); - err = register_pernet_subsys(&genl_pernet_ops); if (err) goto problem; - err = genl_register_mc_group(&genl_ctrl, ¬ify_grp); - if (err < 0) - goto problem; - return 0; problem: @@ -911,7 +1037,7 @@ problem: subsys_initcall(genl_init); -static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group, +static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { struct sk_buff *tmp; @@ -926,7 +1052,7 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group, goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, - pid, group, flags); + portid, group, flags); if (err) goto error; } @@ -934,15 +1060,35 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group, prev = net; } - return nlmsg_multicast(prev->genl_sock, skb, pid, group, flags); + return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); error: kfree_skb(skb); return err; } -int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group, - gfp_t flags) +int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags) { - return genlmsg_mcast(skb, pid, group, flags); + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return -EINVAL; + group = family->mcgrp_offset + group; + return genlmsg_mcast(skb, portid, group, flags); } EXPORT_SYMBOL(genlmsg_multicast_allns); + +void genl_notify(struct genl_family *family, + struct sk_buff *skb, struct net *net, u32 portid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) +{ + struct sock *sk = net->genl_sock; + int report = 0; + + if (nlh) + report = nlmsg_report(nlh); + + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return; + group = family->mcgrp_offset + group; + nlmsg_notify(sk, skb, portid, group, report, flags); +} +EXPORT_SYMBOL(genl_notify); |
