diff options
Diffstat (limited to 'net/core/sock.c')
| -rw-r--r-- | net/core/sock.c | 352 |
1 files changed, 230 insertions, 122 deletions
diff --git a/net/core/sock.c b/net/core/sock.c index 1e744b12fda..026e01f7027 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -93,6 +93,7 @@ #include <linux/capability.h> #include <linux/errno.h> +#include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> @@ -139,11 +140,60 @@ #include <net/tcp.h> #endif -#include <net/ll_poll.h> +#include <net/busy_poll.h> static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capbility to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socke was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. + */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + + #ifdef CONFIG_MEMCG_KMEM int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { @@ -427,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -474,12 +524,6 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ - sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); - struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); @@ -573,9 +617,7 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); - struct net_device *dev; char devname[IFNAMSIZ]; - unsigned seq; if (sk->sk_bound_dev_if == 0) { len = 0; @@ -586,20 +628,9 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); - ret = -ENODEV; - if (!dev) { - rcu_read_unlock(); + ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + if (ret) goto out; - } - - strcpy(devname, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; len = strlen(devname) + 1; @@ -753,7 +784,7 @@ set_rcvbuf: break; case SO_NO_CHECK: - sk->sk_no_check = valbool; + sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: @@ -900,7 +931,7 @@ set_rcvbuf: case SO_PEEK_OFF: if (sock->ops->set_peek_off) - sock->ops->set_peek_off(sk, val); + ret = sock->ops->set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; @@ -913,8 +944,8 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; -#ifdef CONFIG_NET_LL_RX_POLL - case SO_LL: +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) ret = -EPERM; @@ -926,6 +957,13 @@ set_rcvbuf: } break; #endif + + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + default: ret = -ENOPROTOOPT; break; @@ -936,8 +974,8 @@ set_rcvbuf: EXPORT_SYMBOL(sock_setsockopt); -void cred_to_ucred(struct pid *pid, const struct cred *cred, - struct ucred *ucred) +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; @@ -948,7 +986,6 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred, ucred->gid = from_kgid_munged(current_ns, cred->egid); } } -EXPORT_SYMBOL_GPL(cred_to_ucred); int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) @@ -1027,7 +1064,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_NO_CHECK: - v.val = sk->sk_no_check; + v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: @@ -1179,16 +1216,24 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; -#ifdef CONFIG_NET_LL_RX_POLL - case SO_LL: +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; #endif + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + default: return -ENOPROTOOPT; } @@ -1315,19 +1360,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) -{ - u32 classid; - - classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; -} -EXPORT_SYMBOL(sock_update_classid); -#endif - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) void sock_update_netprioidx(struct sock *sk) { if (in_interrupt()) @@ -1588,6 +1621,25 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ @@ -1654,22 +1706,6 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, EXPORT_SYMBOL(sock_wmalloc); /* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, - gfp_t priority) -{ - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { - struct sk_buff *skb = alloc_skb(size, priority); - if (skb) { - skb_set_owner_r(skb, sk); - return skb; - } - } - return NULL; -} - -/* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) @@ -1734,24 +1770,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1760,50 +1795,54 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1812,6 +1851,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1820,14 +1860,24 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) { int order; @@ -1836,19 +1886,17 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag->offset = 0; return true; } - if (pfrag->offset < pfrag->size) + if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } - /* We restrict high order allocations to users that can afford to wait */ - order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; - + order = SKB_FRAG_PAGE_ORDER; do { - gfp_t gfp = sk->sk_allocation; + gfp_t gfp = prio; if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; pfrag->page = alloc_pages(gfp, order); if (likely(pfrag->page)) { pfrag->offset = 0; @@ -1857,6 +1905,15 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } } while (--order >= 0); + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; @@ -2188,7 +2245,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk) { struct socket_wq *wq; @@ -2305,11 +2362,13 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); -#ifdef CONFIG_NET_LL_RX_POLL +#ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_ll_poll; + sk->sk_ll_usec = sysctl_net_busy_read; #endif + sk->sk_max_pacing_rate = ~0U; + sk->sk_pacing_rate = ~0U; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2347,10 +2406,13 @@ void release_sock(struct sock *sk) if (sk->sk_backlog.tail) __release_sock(sk); + /* Warning : release_cb() might need to release sk ownership, + * ie call sock_release_ownership(sk) before us. + */ if (sk->sk_prot->release_cb) sk->sk_prot->release_cb(sk); - sk->sk_lock.owned = 0; + sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); @@ -2438,6 +2500,52 @@ void sock_enable_timestamp(struct sock *sk, int flag) } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * |
