diff options
Diffstat (limited to 'net/netlink/af_netlink.c')
| -rw-r--r-- | net/netlink/af_netlink.c | 1436 |
1 files changed, 1213 insertions, 223 deletions
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c0353d55d56..e6fac7e3db5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3,6 +3,7 @@ * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -55,93 +56,55 @@ #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> +#include <linux/vmalloc.h> +#include <linux/if_arp.h> +#include <asm/cacheflush.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> -#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) -#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) - -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 portid; - u32 dst_portid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - void (*netlink_bind)(int group); - struct module *module; -}; +#include "af_netlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[0]; }; +/* state bits */ +#define NETLINK_CONGESTED 0x0 + +/* flags */ #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 #define NETLINK_BROADCAST_SEND_ERROR 0x4 #define NETLINK_RECV_NO_ENOBUFS 0x8 -static inline struct netlink_sock *nlk_sk(struct sock *sk) -{ - return container_of(sk, struct netlink_sock, sk); -} - static inline int netlink_is_kernel(struct sock *sk) { return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; } -struct nl_portid_hash { - struct hlist_head *table; - unsigned long rehash_time; - - unsigned int mask; - unsigned int shift; - - unsigned int entries; - unsigned int max_shift; - - u32 rnd; -}; - -struct netlink_table { - struct nl_portid_hash hash; - struct hlist_head mc_list; - struct listeners __rcu *listeners; - unsigned int flags; - unsigned int groups; - struct mutex *cb_mutex; - struct module *module; - void (*bind)(int group); - int registered; -}; - -static struct netlink_table *nl_table; +struct netlink_table *nl_table; +EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static int netlink_dump(struct sock *sk); +static void netlink_skb_destructor(struct sk_buff *skb); -static DEFINE_RWLOCK(nl_table_lock); +DEFINE_RWLOCK(nl_table_lock); +EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static ATOMIC_NOTIFIER_HEAD(netlink_chain); +static DEFINE_SPINLOCK(netlink_tap_lock); +static struct list_head netlink_tap_all __read_mostly; + static inline u32 netlink_group_mask(u32 group) { return group ? 1 << (group - 1) : 0; @@ -152,31 +115,810 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; } -static void netlink_destroy_callback(struct netlink_callback *cb) +int netlink_add_tap(struct netlink_tap *nt) +{ + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) + return -EINVAL; + + spin_lock(&netlink_tap_lock); + list_add_rcu(&nt->list, &netlink_tap_all); + spin_unlock(&netlink_tap_lock); + + if (nt->module) + __module_get(nt->module); + + return 0; +} +EXPORT_SYMBOL_GPL(netlink_add_tap); + +static int __netlink_remove_tap(struct netlink_tap *nt) { - kfree_skb(cb->skb); - kfree(cb); + bool found = false; + struct netlink_tap *tmp; + + spin_lock(&netlink_tap_lock); + + list_for_each_entry(tmp, &netlink_tap_all, list) { + if (nt == tmp) { + list_del_rcu(&nt->list); + found = true; + goto out; + } + } + + pr_warn("__netlink_remove_tap: %p not found\n", nt); +out: + spin_unlock(&netlink_tap_lock); + + if (found && nt->module) + module_put(nt->module); + + return found ? 0 : -ENODEV; } -static void netlink_consume_callback(struct netlink_callback *cb) +int netlink_remove_tap(struct netlink_tap *nt) { - consume_skb(cb->skb); - kfree(cb); + int ret; + + ret = __netlink_remove_tap(nt); + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(netlink_remove_tap); + +static bool netlink_filter_tap(const struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + bool pass = false; + + /* We take the more conservative approach and + * whitelist socket protocols that may pass. + */ + switch (sk->sk_protocol) { + case NETLINK_ROUTE: + case NETLINK_USERSOCK: + case NETLINK_SOCK_DIAG: + case NETLINK_NFLOG: + case NETLINK_XFRM: + case NETLINK_FIB_LOOKUP: + case NETLINK_NETFILTER: + case NETLINK_GENERIC: + pass = true; + break; + } + + return pass; +} + +static int __netlink_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct sk_buff *nskb; + struct sock *sk = skb->sk; + int ret = -ENOMEM; + + dev_hold(dev); + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + nskb->dev = dev; + nskb->protocol = htons((u16) sk->sk_protocol); + nskb->pkt_type = netlink_is_kernel(sk) ? + PACKET_KERNEL : PACKET_USER; + + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + } + + dev_put(dev); + return ret; +} + +static void __netlink_deliver_tap(struct sk_buff *skb) +{ + int ret; + struct netlink_tap *tmp; + + if (!netlink_filter_tap(skb)) + return; + + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + ret = __netlink_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; + } +} + +static void netlink_deliver_tap(struct sk_buff *skb) +{ + rcu_read_lock(); + + if (unlikely(!list_empty(&netlink_tap_all))) + __netlink_deliver_tap(skb); + + rcu_read_unlock(); +} + +static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, + struct sk_buff *skb) +{ + if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) + netlink_deliver_tap(skb); +} + +static void netlink_overrun(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } + } + atomic_inc(&sk->sk_drops); +} + +static void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(NETLINK_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +#ifdef CONFIG_NETLINK_MMAP +static bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +} + +static bool netlink_rx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->rx_ring.pg_vec != NULL; +} + +static bool netlink_tx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->tx_ring.pg_vec != NULL; +} + +static __pure struct page *pgvec_to_page(const void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + else + return virt_to_page(addr); +} + +static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) { + if (pg_vec[i] != NULL) { + if (is_vmalloc_addr(pg_vec[i])) + vfree(pg_vec[i]); + else + free_pages((unsigned long)pg_vec[i], order); + } + } + kfree(pg_vec); +} + +static void *alloc_one_pg_vec_page(unsigned long order) +{ + void *buffer; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | + __GFP_NOWARN | __GFP_NORETRY; + + buffer = (void *)__get_free_pages(gfp_flags, order); + if (buffer != NULL) + return buffer; + + buffer = vzalloc((1 << order) * PAGE_SIZE); + if (buffer != NULL) + return buffer; + + gfp_flags &= ~__GFP_NORETRY; + return (void *)__get_free_pages(gfp_flags, order); +} + +static void **alloc_pg_vec(struct netlink_sock *nlk, + struct nl_mmap_req *req, unsigned int order) +{ + unsigned int block_nr = req->nm_block_nr; + unsigned int i; + void **pg_vec; + + pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); + if (pg_vec == NULL) + return NULL; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = alloc_one_pg_vec_page(order); + if (pg_vec[i] == NULL) + goto err1; + } + + return pg_vec; +err1: + free_pg_vec(pg_vec, order, block_nr); + return NULL; +} + +static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, + bool closing, bool tx_ring) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct sk_buff_head *queue; + void **pg_vec = NULL; + unsigned int order = 0; + int err; + + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; + + if (!closing) { + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; + } + + if (req->nm_block_nr) { + if (ring->pg_vec != NULL) + return -EBUSY; + + if ((int)req->nm_block_size <= 0) + return -EINVAL; + if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) + return -EINVAL; + if (req->nm_frame_size < NL_MMAP_HDRLEN) + return -EINVAL; + if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) + return -EINVAL; + + ring->frames_per_block = req->nm_block_size / + req->nm_frame_size; + if (ring->frames_per_block == 0) + return -EINVAL; + if (ring->frames_per_block * req->nm_block_nr != + req->nm_frame_nr) + return -EINVAL; + + order = get_order(req->nm_block_size); + pg_vec = alloc_pg_vec(nlk, req, order); + if (pg_vec == NULL) + return -ENOMEM; + } else { + if (req->nm_frame_nr) + return -EINVAL; + } + + err = -EBUSY; + mutex_lock(&nlk->pg_vec_lock); + if (closing || atomic_read(&nlk->mapped) == 0) { + err = 0; + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + } + mutex_unlock(&nlk->pg_vec_lock); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); + return err; +} + +static void netlink_mm_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_inc(&nlk_sk(sk)->mapped); +} + +static void netlink_mm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_dec(&nlk_sk(sk)->mapped); +} + +static const struct vm_operations_struct netlink_mmap_ops = { + .open = netlink_mm_open, + .close = netlink_mm_close, +}; + +static int netlink_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + unsigned long start, size, expected; + unsigned int i; + int err = -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + mutex_lock(&nlk->pg_vec_lock); + + expected = 0; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; + } + + if (expected == 0) + goto out; + + size = vma->vm_end - vma->vm_start; + if (size != expected) + goto out; + + start = vma->vm_start; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + + for (i = 0; i < ring->pg_vec_len; i++) { + struct page *page; + void *kaddr = ring->pg_vec[i]; + unsigned int pg_num; + + for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { + page = pgvec_to_page(kaddr); + err = vm_insert_page(vma, start, page); + if (err < 0) + goto out; + start += PAGE_SIZE; + kaddr += PAGE_SIZE; + } + } + } + + atomic_inc(&nlk->mapped); + vma->vm_ops = &netlink_mmap_ops; + err = 0; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} + +static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) +{ +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 + struct page *p_start, *p_end; + + /* First page is flushed through netlink_{get,set}_status */ + p_start = pgvec_to_page(hdr + PAGE_SIZE); + p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1); + while (p_start <= p_end) { + flush_dcache_page(p_start); + p_start++; + } +#endif +} + +static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) +{ + smp_rmb(); + flush_dcache_page(pgvec_to_page(hdr)); + return hdr->nm_status; +} + +static void netlink_set_status(struct nl_mmap_hdr *hdr, + enum nl_mmap_status status) +{ + hdr->nm_status = status; + flush_dcache_page(pgvec_to_page(hdr)); + smp_wmb(); +} + +static struct nl_mmap_hdr * +__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) +{ + unsigned int pg_vec_pos, frame_off; + + pg_vec_pos = pos / ring->frames_per_block; + frame_off = pos % ring->frames_per_block; + + return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); +} + +static struct nl_mmap_hdr * +netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, + enum nl_mmap_status status) +{ + struct nl_mmap_hdr *hdr; + + hdr = __netlink_lookup_frame(ring, pos); + if (netlink_get_status(hdr) != status) + return NULL; + + return hdr; +} + +static struct nl_mmap_hdr * +netlink_current_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + return netlink_lookup_frame(ring, ring->head, status); +} + +static struct nl_mmap_hdr * +netlink_previous_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + unsigned int prev; + + prev = ring->head ? ring->head - 1 : ring->frame_max; + return netlink_lookup_frame(ring, prev, status); +} + +static void netlink_increment_head(struct netlink_ring *ring) +{ + ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; +} + +static void netlink_forward_ring(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) + break; + if (hdr->nm_status != NL_MMAP_STATUS_SKIP) + break; + netlink_increment_head(ring); + } while (ring->head != head); +} + +static bool netlink_dump_space(struct netlink_sock *nlk) +{ + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + unsigned int n; + + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + return false; + + n = ring->head + ring->frame_max / 2; + if (n > ring->frame_max) + n -= ring->frame_max; + + hdr = __netlink_lookup_frame(ring, n); + + return hdr->nm_status == NL_MMAP_STATUS_UNUSED; +} + +static unsigned int netlink_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int mask; + int err; + + if (nlk->rx_ring.pg_vec != NULL) { + /* Memory mapped sockets don't call recvmsg(), so flow control + * for dumps is performed here. A dump is allowed to continue + * if at least half the ring is unused. + */ + while (nlk->cb_running && netlink_dump_space(nlk)) { + err = netlink_dump(sk); + if (err < 0) { + sk->sk_err = -err; + sk->sk_error_report(sk); + break; + } + } + netlink_rcv_wake(sk); + } + + mask = datagram_poll(file, sock, wait); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + netlink_forward_ring(&nlk->rx_ring); + if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + spin_lock_bh(&sk->sk_write_queue.lock); + if (nlk->tx_ring.pg_vec) { + if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLOUT | POLLWRNORM; + } + spin_unlock_bh(&sk->sk_write_queue.lock); + + return mask; +} + +static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) +{ + return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); +} + +static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, + struct netlink_ring *ring, + struct nl_mmap_hdr *hdr) +{ + unsigned int size; + void *data; + + size = ring->frame_size - NL_MMAP_HDRLEN; + data = (void *)hdr + NL_MMAP_HDRLEN; + + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->len = 0; + + skb->destructor = netlink_skb_destructor; + NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; + NETLINK_CB(skb).sk = sk; +} + +static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, + u32 dst_portid, u32 dst_group, + struct sock_iocb *siocb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + struct sk_buff *skb; + unsigned int maxlen; + bool excl = true; + int err = 0, len = 0; + + /* Netlink messages are validated by the receiver before processing. + * In order to avoid userspace changing the contents of the message + * after validation, the socket and the ring may only be used by a + * single process, otherwise we fall back to copying. + */ + if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 || + atomic_read(&nlk->mapped) > 1) + excl = false; + + mutex_lock(&nlk->pg_vec_lock); + + ring = &nlk->tx_ring; + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + + do { + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); + if (hdr == NULL) { + if (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending)) + schedule(); + continue; + } + if (hdr->nm_len > maxlen) { + err = -EINVAL; + goto out; + } + + netlink_frame_flush_dcache(hdr); + + if (likely(dst_portid == 0 && dst_group == 0 && excl)) { + skb = alloc_skb_head(GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + sock_hold(sk); + netlink_ring_setup_skb(skb, sk, ring, hdr); + NETLINK_CB(skb).flags |= NETLINK_SKB_TX; + __skb_put(skb, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + } else { + skb = alloc_skb(hdr->nm_len, GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + __skb_put(skb, hdr->nm_len); + memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + } + + netlink_increment_head(ring); + + NETLINK_CB(skb).portid = nlk->portid; + NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).creds = siocb->scm->creds; + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (unlikely(dst_group)) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_portid, dst_group, + GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_portid, + msg->msg_flags & MSG_DONTWAIT); + if (err < 0) + goto out; + len += err; + + } while (hdr != NULL || + (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending))); + + if (len > 0) + err = len; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} + +static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) +{ + struct nl_mmap_hdr *hdr; + + hdr = netlink_mmap_hdr(skb); + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_frame_flush_dcache(hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + + NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; + kfree_skb(skb); +} + +static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + + spin_lock_bh(&sk->sk_receive_queue.lock); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) { + spin_unlock_bh(&sk->sk_receive_queue.lock); + kfree_skb(skb); + netlink_overrun(sk); + return; + } + netlink_increment_head(ring); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_set_status(hdr, NL_MMAP_STATUS_COPY); +} + +#else /* CONFIG_NETLINK_MMAP */ +#define netlink_skb_is_mmaped(skb) false +#define netlink_rx_is_mmaped(sk) false +#define netlink_tx_is_mmaped(sk) false +#define netlink_mmap sock_no_mmap +#define netlink_poll datagram_poll +#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 +#endif /* CONFIG_NETLINK_MMAP */ + +static void netlink_skb_destructor(struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + struct nl_mmap_hdr *hdr; + struct netlink_ring *ring; + struct sock *sk; + + /* If a packet from the kernel to userspace was freed because of an + * error without being delivered to userspace, the kernel must reset + * the status. In the direction userspace to kernel, the status is + * always reset here after the packet was processed and freed. + */ + if (netlink_skb_is_mmaped(skb)) { + hdr = netlink_mmap_hdr(skb); + sk = NETLINK_CB(skb).sk; + + if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + ring = &nlk_sk(sk)->tx_ring; + } else { + if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { + hdr->nm_len = 0; + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + } + ring = &nlk_sk(sk)->rx_ring; + } + + WARN_ON(atomic_read(&ring->pending) == 0); + atomic_dec(&ring->pending); + sock_put(sk); + + skb->head = NULL; + } +#endif + if (is_vmalloc_addr(skb->head)) { + if (!skb->cloned || + !atomic_dec_return(&(skb_shinfo(skb)->dataref))) + vfree(skb->head); + + skb->head = NULL; + } + if (skb->sk != NULL) + sock_rfree(skb); +} + +static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + WARN_ON(skb->sk != NULL); + skb->sk = sk; + skb->destructor = netlink_skb_destructor; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (nlk->cb) { - if (nlk->cb->done) - nlk->cb->done(nlk->cb); + if (nlk->cb_running) { + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); - module_put(nlk->cb->module); - netlink_destroy_callback(nlk->cb); + module_put(nlk->cb.module); + kfree_skb(nlk->cb.skb); } skb_queue_purge(&sk->sk_receive_queue); +#ifdef CONFIG_NETLINK_MMAP + if (1) { + struct nl_mmap_req req; + + memset(&req, 0, sizeof(req)); + if (nlk->rx_ring.pg_vec) + netlink_set_ring(sk, &req, true, false); + memset(&req, 0, sizeof(req)); + if (nlk->tx_ring.pg_vec) + netlink_set_ring(sk, &req, true, true); + } +#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -243,17 +985,23 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } +static bool netlink_compare(struct net *net, struct sock *sk) +{ + return net_eq(sock_net(sk), net); +} + static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { - struct nl_portid_hash *hash = &nl_table[protocol].hash; + struct netlink_table *table = &nl_table[protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *sk; - struct hlist_node *node; read_lock(&nl_table_lock); head = nl_portid_hashfn(hash, portid); - sk_for_each(sk, node, head) { - if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->portid == portid)) { + sk_for_each(sk, head) { + if (table->compare(net, sk) && + (nlk_sk(sk)->portid == portid)) { sock_hold(sk); goto found; } @@ -312,9 +1060,9 @@ static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow) for (i = 0; i <= omask; i++) { struct sock *sk; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; - sk_for_each_safe(sk, node, tmp, &otable[i]) + sk_for_each_safe(sk, tmp, &otable[i]) __sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid)); } @@ -344,7 +1092,6 @@ static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; - struct hlist_node *node; unsigned long mask; unsigned int i; struct listeners *listeners; @@ -355,7 +1102,7 @@ netlink_update_listeners(struct sock *sk) for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; - sk_for_each_bound(sk, node, &tbl->mc_list) { + sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } @@ -367,22 +1114,23 @@ netlink_update_listeners(struct sock *sk) static int netlink_insert(struct sock *sk, struct net *net, u32 portid) { - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; int err = -EADDRINUSE; struct sock *osk; - struct hlist_node *node; int len; netlink_table_grab(); head = nl_portid_hashfn(hash, portid); len = 0; - sk_for_each(osk, node, head) { - if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->portid == portid)) + sk_for_each(osk, head) { + if (table->compare(net, osk) && + (nlk_sk(osk)->portid == portid)) break; len++; } - if (node) + if (osk) goto err; err = -EBUSY; @@ -443,6 +1191,9 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); +#ifdef CONFIG_NETLINK_MMAP + mutex_init(&nlk->pg_vec_lock); +#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -455,7 +1206,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; - void (*bind)(int group); + int (*bind)(int group); + void (*unbind)(int group); int err = 0; sock->state = SS_UNCONNECTED; @@ -481,6 +1233,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; + unbind = nl_table[protocol].unbind; netlink_unlock_table(); if (err < 0) @@ -497,6 +1250,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; + nlk->netlink_unbind = unbind; out: return err; @@ -550,6 +1304,7 @@ static int netlink_release(struct socket *sock) kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } @@ -572,10 +1327,10 @@ static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *osk; - struct hlist_node *node; s32 portid = task_tgid_vnr(current); int err; static s32 rover = -4097; @@ -584,8 +1339,8 @@ retry: cond_resched(); netlink_table_grab(); head = nl_portid_hashfn(hash, portid); - sk_for_each(osk, node, head) { - if (!net_eq(sock_net(osk), net)) + sk_for_each(osk, head) { + if (!table->compare(net, osk)) continue; if (nlk_sk(osk)->portid == portid) { /* Bind collision, search negative portid values. */ @@ -609,7 +1364,74 @@ retry: return err; } -static inline int netlink_capable(const struct socket *sock, unsigned int flag) +/** + * __netlink_ns_capable - General netlink message capability test + * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in the user namespace @user_ns. + */ +bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, + struct user_namespace *user_ns, int cap) +{ + return ((nsp->flags & NETLINK_SKB_DST) || + file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(__netlink_ns_capable); + +/** + * netlink_ns_capable - General netlink message capability test + * @skb: socket buffer holding a netlink command from userspace + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in the user namespace @user_ns. + */ +bool netlink_ns_capable(const struct sk_buff *skb, + struct user_namespace *user_ns, int cap) +{ + return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); +} +EXPORT_SYMBOL(netlink_ns_capable); + +/** + * netlink_capable - Netlink global message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in all user namespaces. + */ +bool netlink_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, &init_user_ns, cap); +} +EXPORT_SYMBOL(netlink_capable); + +/** + * netlink_net_capable - Netlink network namespace message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap over the network namespace of + * the socket we received the message from. + */ +bool netlink_net_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); +} +EXPORT_SYMBOL(netlink_net_capable); + +static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); @@ -660,6 +1482,19 @@ static int netlink_realloc_groups(struct sock *sk) return err; } +static void netlink_unbind(int group, long unsigned int groups, + struct netlink_sock *nlk) +{ + int undo; + + if (!nlk->netlink_unbind) + return; + + for (undo = 0; undo < group; undo++) + if (test_bit(group, &groups)) + nlk->netlink_unbind(undo); +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -668,6 +1503,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + long unsigned int groups = nladdr->nl_groups; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -676,45 +1512,53 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups) { - if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + if (groups) { + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } - if (nlk->portid) { + if (nlk->portid) if (nladdr->nl_pid != nlk->portid) return -EINVAL; - } else { + + if (nlk->netlink_bind && groups) { + int group; + + for (group = 0; group < nlk->ngroups; group++) { + if (!test_bit(group, &groups)) + continue; + err = nlk->netlink_bind(group); + if (!err) + continue; + netlink_unbind(group, groups, nlk); + return err; + } + } + + if (!nlk->portid) { err = nladdr->nl_pid ? netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); - if (err) + if (err) { + netlink_unbind(nlk->ngroups - 1, groups, nlk); return err; + } } - if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) return 0; netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + - hweight32(nladdr->nl_groups) - + hweight32(groups) - hweight32(nlk->groups[0])); - nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); - if (nlk->netlink_bind && nlk->groups[0]) { - int i; - - for (i=0; i<nlk->ngroups; i++) { - if (test_bit(i, nlk->groups)) - nlk->netlink_bind(i); - } - } - return 0; } @@ -738,8 +1582,8 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family != AF_NETLINK) return -EINVAL; - /* Only superuser is allowed to send multicasts */ - if (nladdr->nl_groups && !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) + if ((nladdr->nl_groups || nladdr->nl_pid) && + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; if (!nlk->portid) @@ -775,19 +1619,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, return 0; } -static void netlink_overrun(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - } - } - atomic_inc(&sk->sk_drops); -} - static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; @@ -809,7 +1640,7 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) struct sock *netlink_getsockbyfilp(struct file *filp) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) @@ -823,6 +1654,33 @@ struct sock *netlink_getsockbyfilp(struct file *filp) return sock; } +static struct sk_buff *netlink_alloc_large_skb(unsigned int size, + int broadcast) +{ + struct sk_buff *skb; + void *data; + + if (size <= NLMSG_GOODSIZE || broadcast) + return alloc_skb(size, GFP_KERNEL); + + size = SKB_DATA_ALIGN(size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + data = vmalloc(size); + if (data == NULL) + return NULL; + + skb = build_skb(data, size); + if (skb == NULL) + vfree(data); + else { + skb->head_frag = 0; + skb->destructor = netlink_skb_destructor; + } + + return skb; +} + /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the @@ -840,8 +1698,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) { + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(NETLINK_CONGESTED, &nlk->state)) && + !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -855,7 +1714,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && + test_bit(NETLINK_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -869,7 +1728,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, } return 1; } - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); return 0; } @@ -877,8 +1736,17 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, len); + netlink_deliver_tap(skb); + +#ifdef CONFIG_NETLINK_MMAP + if (netlink_skb_is_mmaped(skb)) + netlink_queue_mmaped_skb(sk, skb); + else if (netlink_rx_is_mmaped(sk)) + netlink_ring_set_copied(sk, skb); + else +#endif /* CONFIG_NETLINK_MMAP */ + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk); return len; } @@ -900,10 +1768,12 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; - skb_orphan(skb); + WARN_ON(skb->sk != NULL); + if (netlink_skb_is_mmaped(skb)) + return skb; delta = skb->end - skb->tail; - if (delta * 2 < skb->truesize) + if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { @@ -920,16 +1790,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) return skb; } -static void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(0, &nlk->state); - if (!test_bit(0, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { @@ -939,8 +1799,9 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; - skb_set_owner_r(skb, sk); - NETLINK_CB(skb).ssk = ssk; + netlink_skb_set_owner_r(skb, sk); + NETLINK_CB(skb).sk = ssk; + netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { @@ -986,6 +1847,73 @@ retry: } EXPORT_SYMBOL(netlink_unicast); +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ +#ifdef CONFIG_NETLINK_MMAP + struct sock *sk = NULL; + struct sk_buff *skb; + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + unsigned int maxlen; + + sk = netlink_getsockbyportid(ssk, dst_portid); + if (IS_ERR(sk)) + goto out; + + ring = &nlk_sk(sk)->rx_ring; + /* fast-path without atomic ops for common case: non-mmaped receiver */ + if (ring->pg_vec == NULL) + goto out_put; + + if (ring->frame_size - NL_MMAP_HDRLEN < size) + goto out_put; + + skb = alloc_skb_head(gfp_mask); + if (skb == NULL) + goto err1; + + spin_lock_bh(&sk->sk_receive_queue.lock); + /* check again under lock */ + if (ring->pg_vec == NULL) + goto out_free; + + /* check again under lock */ + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + if (maxlen < size) + goto out_free; + + netlink_forward_ring(ring); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + netlink_increment_head(ring); + + spin_unlock_bh(&sk->sk_receive_queue.lock); + return skb; + +err2: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + netlink_overrun(sk); +err1: + sock_put(sk); + return NULL; + +out_free: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); +out_put: + sock_put(sk); +out: +#endif + return alloc_skb(size, gfp_mask); +} +EXPORT_SYMBOL_GPL(netlink_alloc_skb); + int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -1010,8 +1938,8 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(0, &nlk->state)) { - skb_set_owner_r(skb, sk); + !test_bit(NETLINK_CONGESTED, &nlk->state)) { + netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } @@ -1101,7 +2029,6 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; - struct hlist_node *node; struct sock *sk; skb = netlink_trim(skb, allocation); @@ -1124,7 +2051,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid netlink_lock_table(); - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); @@ -1191,7 +2118,7 @@ out: * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) - * @groups: the broadcast group that will notice the error + * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the @@ -1200,7 +2127,6 @@ out: int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; - struct hlist_node *node; struct sock *sk; int ret = 0; @@ -1212,7 +2138,7 @@ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) read_lock(&nl_table_lock); - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock(&nl_table_lock); @@ -1248,7 +2174,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optlen >= sizeof(int) && + if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && + optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -1262,20 +2189,24 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { - if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; + if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { + err = nlk->netlink_bind(val); + if (err) + return err; + } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); - - if (nlk->netlink_bind) - nlk->netlink_bind(val); + if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) + nlk->netlink_unbind(val); err = 0; break; @@ -1290,13 +2221,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, case NETLINK_NO_ENOBUFS: if (val) { nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(0, &nlk->state); + clear_bit(NETLINK_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; } err = 0; break; +#ifdef CONFIG_NETLINK_MMAP + case NETLINK_RX_RING: + case NETLINK_TX_RING: { + struct nl_mmap_req req; + + /* Rings might consume more memory than queue limits, require + * CAP_NET_ADMIN. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (optlen < sizeof(req)) + return -EINVAL; + if (copy_from_user(&req, optval, sizeof(req))) + return -EFAULT; + err = netlink_set_ring(sk, &req, false, + optname == NETLINK_TX_RING); + break; + } +#endif /* CONFIG_NETLINK_MMAP */ default: err = -ENOPROTOOPT; } @@ -1369,12 +2319,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; + u32 netlink_skb_flags = 0; if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; @@ -1394,8 +2345,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && - !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; + netlink_skb_flags |= NETLINK_SKB_DST; } else { dst_portid = nlk->dst_portid; dst_group = nlk->dst_group; @@ -1407,17 +2359,25 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } + if (netlink_tx_is_mmaped(sk) && + msg->msg_iov->iov_base == NULL) { + err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, + siocb); + goto out; + } + err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); + skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = siocb->scm->creds; + NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { @@ -1483,7 +2443,10 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } #endif - msg->msg_namelen = 0; + /* Record the max length of recvmsg() calls for future allocations */ + nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); + nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, + 16384); copied = data_skb->len; if (len < copied) { @@ -1495,7 +2458,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); if (msg->msg_name) { - struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; @@ -1516,10 +2479,11 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, skb_free_datagram(sk, skb); - if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { + if (nlk->cb_running && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { - sk->sk_err = ret; + sk->sk_err = -ret; sk->sk_error_report(sk); } } @@ -1530,7 +2494,7 @@ out: return err ? : copied; } -static void netlink_data_ready(struct sock *sk, int len) +static void netlink_data_ready(struct sock *sk) { BUG(); } @@ -1600,6 +2564,8 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].flags = cfg->flags; + if (cfg->compare) + nl_table[unit].compare = cfg->compare; } nl_table[unit].registered = 1; } else { @@ -1676,42 +2642,26 @@ int netlink_change_ngroups(struct sock *sk, unsigned int groups) void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; - struct hlist_node *node; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; - sk_for_each_bound(sk, node, &tbl->mc_list) + sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } -/** - * netlink_clear_multicast_users - kick off multicast listeners - * - * This function removes all listeners from the given group. - * @ksk: The kernel netlink socket, as returned by - * netlink_kernel_create(). - * @group: The multicast group to clear. - */ -void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) -{ - netlink_table_grab(); - __netlink_clear_multicast_users(ksk, group); - netlink_table_ungrab(); -} - struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; - int size = NLMSG_LENGTH(len); + int size = nlmsg_msg_size(len); - nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size)); + nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) - memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size); + memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); @@ -1731,18 +2681,41 @@ static int netlink_dump(struct sock *sk) int alloc_size; mutex_lock(nlk->cb_mutex); - - cb = nlk->cb; - if (cb == NULL) { + if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } + cb = &nlk->cb; alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL); + if (!netlink_rx_is_mmaped(sk) && + atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + goto errout_skb; + + /* NLMSG_GOODSIZE is small to avoid high order allocations being + * required, but it makes sense to _attempt_ a 16K bytes allocation + * to reduce number of system calls on dump operations, if user + * ever provided a big enough buffer. + */ + if (alloc_size < nlk->max_recvmsg_len) { + skb = netlink_alloc_skb(sk, + nlk->max_recvmsg_len, + nlk->portid, + GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + /* available room should be exact amount to avoid MSG_TRUNC */ + if (skb) + skb_reserve(skb, skb_tailroom(skb) - + nlk->max_recvmsg_len); + } + if (!skb) + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, + GFP_KERNEL); if (!skb) goto errout_skb; + netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -1771,11 +2744,11 @@ static int netlink_dump(struct sock *sk) if (cb->done) cb->done(cb); - nlk->cb = NULL; - mutex_unlock(nlk->cb_mutex); + nlk->cb_running = false; + mutex_unlock(nlk->cb_mutex); module_put(cb->module); - netlink_consume_callback(cb); + consume_skb(cb->skb); return 0; errout_skb: @@ -1793,47 +2766,51 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct netlink_sock *nlk; int ret; - cb = kzalloc(sizeof(*cb), GFP_KERNEL); - if (cb == NULL) - return -ENOBUFS; - - cb->dump = control->dump; - cb->done = control->done; - cb->nlh = nlh; - cb->data = control->data; - cb->module = control->module; - cb->min_dump_alloc = control->min_dump_alloc; - atomic_inc(&skb->users); - cb->skb = skb; + /* Memory mapped dump requests need to be copied to avoid looping + * on the pending state in netlink_mmap_sendmsg() while the CB hold + * a reference to the skb. + */ + if (netlink_skb_is_mmaped(skb)) { + skb = skb_copy(skb, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; + } else + atomic_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { - netlink_destroy_callback(cb); - return -ECONNREFUSED; + ret = -ECONNREFUSED; + goto error_free; } - nlk = nlk_sk(sk); + nlk = nlk_sk(sk); mutex_lock(nlk->cb_mutex); /* A dump is in progress... */ - if (nlk->cb) { - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); + if (nlk->cb_running) { ret = -EBUSY; - goto out; + goto error_unlock; } /* add reference of module which cb->dump belongs to */ - if (!try_module_get(cb->module)) { - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); + if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; - goto out; + goto error_unlock; } - nlk->cb = cb; + cb = &nlk->cb; + memset(cb, 0, sizeof(*cb)); + cb->dump = control->dump; + cb->done = control->done; + cb->nlh = nlh; + cb->data = control->data; + cb->module = control->module; + cb->min_dump_alloc = control->min_dump_alloc; + cb->skb = skb; + + nlk->cb_running = true; + mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); -out: sock_put(sk); if (ret) @@ -1843,6 +2820,13 @@ out: * signal not to send ACK even if it was requested. */ return -EINTR; + +error_unlock: + sock_put(sk); + mutex_unlock(nlk->cb_mutex); +error_free: + kfree_skb(skb); + return ret; } EXPORT_SYMBOL(__netlink_dump_start); @@ -1857,7 +2841,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (err) payload += nlmsg_len(nlh); - skb = nlmsg_new(payload, GFP_KERNEL); + skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), + NETLINK_CB(in_skb).portid, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -1974,14 +2959,13 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) struct nl_seq_iter *iter = seq->private; int i, j; struct sock *s; - struct hlist_node *node; loff_t off = 0; for (i = 0; i < MAX_LINKS; i++) { struct nl_portid_hash *hash = &nl_table[i].hash; for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, node, &hash->table[j]) { + sk_for_each(s, &hash->table[j]) { if (sock_net(s) != seq_file_net(seq)) continue; if (off == pos) { @@ -2007,6 +2991,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *s; struct nl_seq_iter *iter; + struct net *net; int i, j; ++*pos; @@ -2014,11 +2999,12 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return netlink_seq_socket_idx(seq, 0); + net = seq_file_net(seq); iter = seq->private; s = v; do { s = sk_next(s); - } while (s && sock_net(s) != seq_file_net(seq)); + } while (s && !nl_table[s->sk_protocol].compare(net, s)); if (s) return s; @@ -2030,7 +3016,8 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); - while (s && sock_net(s) != seq_file_net(seq)) + + while (s && !nl_table[s->sk_protocol].compare(net, s)) s = sk_next(s); if (s) { iter->link = i; @@ -2062,14 +3049,14 @@ static int netlink_seq_show(struct seq_file *seq, void *v) struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", + seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - nlk->cb, + nlk->cb_running, atomic_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) @@ -2124,7 +3111,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll = netlink_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -2132,7 +3119,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = sock_no_mmap, + .mmap = netlink_mmap, .sendpage = sock_no_sendpage, }; @@ -2145,7 +3132,7 @@ static const struct net_proto_family netlink_family_ops = { static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS - if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops)) return -ENOMEM; #endif return 0; @@ -2154,7 +3141,7 @@ static int __net_init netlink_net_init(struct net *net) static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "netlink"); + remove_proc_entry("netlink", net->proc_net); #endif } @@ -2185,7 +3172,6 @@ static struct pernet_operations __net_initdata netlink_net_ops = { static int __init netlink_proto_init(void) { - struct sk_buff *dummy_skb; int i; unsigned long limit; unsigned int order; @@ -2194,7 +3180,7 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; - BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) @@ -2224,8 +3210,12 @@ static int __init netlink_proto_init(void) hash->shift = 0; hash->mask = 0; hash->rehash_time = jiffies; + + nl_table[i].compare = netlink_compare; } + INIT_LIST_HEAD(&netlink_tap_all); + netlink_add_usersock_entry(); sock_register(&netlink_family_ops); |
