diff options
Diffstat (limited to 'net/core/sock.c')
| -rw-r--r-- | net/core/sock.c | 2737 |
1 files changed, 1986 insertions, 751 deletions
diff --git a/net/core/sock.c b/net/core/sock.c index 1a7e6eac90b..026e01f7027 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,8 +7,6 @@ * handler for protocols to use and generic option handler. * * - * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> @@ -34,7 +32,7 @@ * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer - * code. The ACK stuff can wait and needs major + * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. @@ -91,9 +89,11 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> -#include <linux/config.h> #include <linux/errno.h> +#include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> @@ -112,42 +112,273 @@ #include <linux/poll.h> #include <linux/tcp.h> #include <linux/init.h> +#include <linux/highmem.h> +#include <linux/user_namespace.h> +#include <linux/static_key.h> +#include <linux/memcontrol.h> +#include <linux/prefetch.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> +#include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h> #include <linux/filter.h> +#include <trace/events/sock.h> + #ifdef CONFIG_INET #include <net/tcp.h> #endif +#include <net/busy_poll.h> + +static DEFINE_MUTEX(proto_list_mutex); +static LIST_HEAD(proto_list); + +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capbility to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socke was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. + */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + + +#ifdef CONFIG_MEMCG_KMEM +int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +{ + struct proto *proto; + int ret = 0; + + mutex_lock(&proto_list_mutex); + list_for_each_entry(proto, &proto_list, node) { + if (proto->init_cgroup) { + ret = proto->init_cgroup(memcg, ss); + if (ret) + goto out; + } + } + + mutex_unlock(&proto_list_mutex); + return ret; +out: + list_for_each_entry_continue_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(memcg); + mutex_unlock(&proto_list_mutex); + return ret; +} + +void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) +{ + struct proto *proto; + + mutex_lock(&proto_list_mutex); + list_for_each_entry_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(memcg); + mutex_unlock(&proto_list_mutex); +} +#endif + +/* + * Each address family might have different locking rules, so we have + * one slock key per address family: + */ +static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_slock_keys[AF_MAX]; + +#if defined(CONFIG_MEMCG_KMEM) +struct static_key memcg_socket_limit_enabled; +EXPORT_SYMBOL(memcg_socket_limit_enabled); +#endif + +/* + * Make lock validator output more readable. (we pre-construct these + * strings build-time, so that runtime initialization of socket + * locks is fast): + */ +static const char *const af_family_key_strings[AF_MAX+1] = { + "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , + "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", + "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , + "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , + "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , + "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , + "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , + "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , + "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , + "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , + "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , + "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" +}; +static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , + "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", + "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , + "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , + "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , + "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , + "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , + "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , + "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , + "slock-27" , "slock-28" , "slock-AF_CAN" , + "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , + "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" +}; +static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , + "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , + "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , + "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , + "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , + "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , + "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , + "clock-27" , "clock-28" , "clock-AF_CAN" , + "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , + "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" +}; + +/* + * sk_callback_lock locking rules are per-address-family, + * so split the lock classes by using a per-AF key: + */ +static struct lock_class_key af_callback_keys[AF_MAX]; + /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256) +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max = SK_WMEM_MAX; -__u32 sysctl_rmem_max = SK_RMEM_MAX; -__u32 sysctl_wmem_default = SK_WMEM_MAX; -__u32 sysctl_rmem_default = SK_RMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +EXPORT_SYMBOL(sysctl_wmem_max); +__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +EXPORT_SYMBOL(sysctl_rmem_max); +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancillary data plus some space */ +int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +EXPORT_SYMBOL(sysctl_optmem_max); + +struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(memalloc_socks); + +/** + * sk_set_memalloc - sets %SOCK_MEMALLOC + * @sk: socket to set it on + * + * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. + * It's the responsibility of the admin to adjust min_free_kbytes + * to meet the requirements + */ +void sk_set_memalloc(struct sock *sk) +{ + sock_set_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation |= __GFP_MEMALLOC; + static_key_slow_inc(&memalloc_socks); +} +EXPORT_SYMBOL_GPL(sk_set_memalloc); -/* Maximal space eaten by iovec or ancilliary data plus some space */ -int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); +void sk_clear_memalloc(struct sock *sk) +{ + sock_reset_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation &= ~__GFP_MEMALLOC; + static_key_slow_dec(&memalloc_socks); + + /* + * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward + * progress of swapping. However, if SOCK_MEMALLOC is cleared while + * it has rmem allocations there is a risk that the user of the + * socket cannot make forward progress due to exceeding the rmem + * limits. By rights, sk_clear_memalloc() should only be called + * on sockets being torn down but warn and reset the accounting if + * that assumption breaks. + */ + if (WARN_ON(sk->sk_forward_alloc)) + sk_mem_reclaim(sk); +} +EXPORT_SYMBOL_GPL(sk_clear_memalloc); + +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int ret; + unsigned long pflags = current->flags; + + /* these should have been dropped before queueing */ + BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); + + current->flags |= PF_MEMALLOC; + ret = sk->sk_backlog_rcv(sk, skb); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + + return ret; +} +EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { @@ -157,7 +388,20 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) return -EINVAL; if (copy_from_user(&tv, optval, sizeof(tv))) return -EFAULT; + if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) + return -EDOM; + + if (tv.tv_sec < 0) { + static int warned __read_mostly; + *timeo_p = 0; + if (warned < 10 && net_ratelimit()) { + warned++; + pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", + __func__, current->comm, task_pid_nr(current)); + } + return 0; + } *timeo_p = MAX_SCHEDULE_TIMEOUT; if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; @@ -170,505 +414,928 @@ static void sock_warn_obsolete_bsdism(const char *name) { static int warned; static char warncomm[TASK_COMM_LEN]; - if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); - printk(KERN_WARNING "process `%s' is using obsolete " - "%s SO_BSDCOMPAT\n", warncomm, name); + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", + warncomm, name); warned++; } } -static void sock_disable_timestamp(struct sock *sk) -{ - if (sock_flag(sk, SOCK_TIMESTAMP)) { - sock_reset_flag(sk, SOCK_TIMESTAMP); - net_disable_timestamp(); +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags) +{ + if (sk->sk_flags & flags) { + sk->sk_flags &= ~flags; + if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + net_disable_timestamp(); } } +int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + atomic_inc(&sk->sk_drops); + trace_sock_rcvqueue_full(sk, skb); + return -ENOMEM; + } + + err = sk_filter(sk, skb); + if (err) + return err; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { + atomic_inc(&sk->sk_drops); + return -ENOBUFS; + } + + skb->dev = NULL; + skb_set_owner_r(skb, sk); + + /* Cache the SKB length before we tack it onto the receive + * queue. Once it is added it no longer belongs to us and + * may be freed by other threads of control pulling packets + * from the queue. + */ + skb_len = skb->len; + + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + + spin_lock_irqsave(&list->lock, flags); + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + return 0; +} +EXPORT_SYMBOL(sock_queue_rcv_skb); + +int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter(sk, skb)) + goto discard_and_relse; + + skb->dev = NULL; + + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + if (nested) + bh_lock_sock_nested(sk); + else + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + /* + * trylock + unlock semantics: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); + + rc = sk_backlog_rcv(sk, skb); + + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + bh_unlock_sock(sk); + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + + bh_unlock_sock(sk); +out: + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(sk_receive_skb); + +struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_tx_queue_clear(sk); + RCU_INIT_POINTER(sk->sk_dst_cache, NULL); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(__sk_dst_check); + +struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = sk_dst_get(sk); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_dst_reset(sk); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(sk_dst_check); + +static int sock_setbindtodevice(struct sock *sk, char __user *optval, + int optlen) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + int index; + + /* Sorry... */ + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + goto out; + + ret = -EINVAL; + if (optlen < 0) + goto out; + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + memset(devname, 0, sizeof(devname)); + + ret = -EFAULT; + if (copy_from_user(devname, optval, optlen)) + goto out; + + index = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); + ret = -ENODEV; + if (!dev) + goto out; + } + + lock_sock(sk); + sk->sk_bound_dev_if = index; + sk_dst_reset(sk); + release_sock(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +static int sock_getbindtodevice(struct sock *sk, char __user *optval, + int __user *optlen, int len) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + + if (sk->sk_bound_dev_if == 0) { + len = 0; + goto zero; + } + + ret = -EINVAL; + if (len < IFNAMSIZ) + goto out; + + ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + if (ret) + goto out; + + len = strlen(devname) + 1; + + ret = -EFAULT; + if (copy_to_user(optval, devname, len)) + goto out; + +zero: + ret = -EFAULT; + if (put_user(len, optlen)) + goto out; + + ret = 0; + +out: +#endif + + return ret; +} + +static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +{ + if (valbool) + sock_set_flag(sk, bit); + else + sock_reset_flag(sk, bit); +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { - struct sock *sk=sock->sk; - struct sk_filter *filter; + struct sock *sk = sock->sk; int val; int valbool; struct linger ling; int ret = 0; - + /* * Options without arguments */ -#ifdef SO_DONTLINGER /* Compatibility item... */ - if (optname == SO_DONTLINGER) { - lock_sock(sk); - sock_reset_flag(sk, SOCK_LINGER); - release_sock(sk); - return 0; - } -#endif - - if(optlen<sizeof(int)) - return(-EINVAL); - + if (optname == SO_BINDTODEVICE) + return sock_setbindtodevice(sk, optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) return -EFAULT; - - valbool = val?1:0; + + valbool = val ? 1 : 0; lock_sock(sk); - switch(optname) - { - case SO_DEBUG: - if(val && !capable(CAP_NET_ADMIN)) - { - ret = -EACCES; - } - else if (valbool) - sock_set_flag(sk, SOCK_DBG); - else - sock_reset_flag(sk, SOCK_DBG); - break; - case SO_REUSEADDR: - sk->sk_reuse = valbool; - break; - case SO_TYPE: - case SO_ERROR: - ret = -ENOPROTOOPT; - break; - case SO_DONTROUTE: - if (valbool) - sock_set_flag(sk, SOCK_LOCALROUTE); - else - sock_reset_flag(sk, SOCK_LOCALROUTE); - break; - case SO_BROADCAST: - sock_valbool_flag(sk, SOCK_BROADCAST, valbool); - break; - case SO_SNDBUF: - /* Don't error on this BSD doesn't and if you think - about it this is right. Otherwise apps have to - play 'guess the biggest size' games. RCVBUF/SNDBUF - are treated in BSD as hints */ - - if (val > sysctl_wmem_max) - val = sysctl_wmem_max; + switch (optname) { + case SO_DEBUG: + if (val && !capable(CAP_NET_ADMIN)) + ret = -EACCES; + else + sock_valbool_flag(sk, SOCK_DBG, valbool); + break; + case SO_REUSEADDR: + sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); + break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + * about it this is right. Otherwise apps have to + * play 'guess the biggest size' games. RCVBUF/SNDBUF + * are treated in BSD as hints + */ + val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - if ((val * 2) < SOCK_MIN_SNDBUF) - sk->sk_sndbuf = SOCK_MIN_SNDBUF; - else - sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + /* Wake up sending tasks if we upped the value. */ + sk->sk_write_space(sk); + break; - /* - * Wake up sending tasks if we - * upped the value. - */ - sk->sk_write_space(sk); + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; break; + } + goto set_sndbuf; - case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - goto set_sndbuf; - - case SO_RCVBUF: - /* Don't error on this BSD doesn't and if you think - about it this is right. Otherwise apps have to - play 'guess the biggest size' games. RCVBUF/SNDBUF - are treated in BSD as hints */ - - if (val > sysctl_rmem_max) - val = sysctl_rmem_max; + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + * about it this is right. Otherwise apps have to + * play 'guess the biggest size' games. RCVBUF/SNDBUF + * are treated in BSD as hints + */ + val = min_t(u32, val, sysctl_rmem_max); set_rcvbuf: - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* FIXME: is this lower bound the right one? */ - if ((val * 2) < SOCK_MIN_RCVBUF) - sk->sk_rcvbuf = SOCK_MIN_RCVBUF; - else - sk->sk_rcvbuf = val * 2; - break; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + /* + * We double it on the way in to account for + * "struct sk_buff" etc. overhead. Applications + * assume that the SO_RCVBUF setting they make will + * allow that much actual data to be received on that + * socket. + * + * Applications are unaware that "struct sk_buff" and + * other overheads allocate from the receive buffer + * during socket buffer allocation. + * + * And after considering the possible alternatives, + * returning the value we actually used in getsockopt + * is the most desirable behavior. + */ + sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + break; - case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - goto set_rcvbuf; + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_rcvbuf; - case SO_KEEPALIVE: + case SO_KEEPALIVE: #ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP) - tcp_set_keepalive(sk, valbool); + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) + tcp_set_keepalive(sk, valbool); #endif - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check_tx = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || + ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + sk->sk_priority = val; + else + ret = -EPERM; + break; + + case SO_LINGER: + if (optlen < sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ break; - - case SO_OOBINLINE: - sock_valbool_flag(sk, SOCK_URGINLINE, valbool); - break; - - case SO_NO_CHECK: - sk->sk_no_check = valbool; - break; - - case SO_PRIORITY: - if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) - sk->sk_priority = val; - else - ret = -EPERM; + } + if (copy_from_user(&ling, optval, sizeof(ling))) { + ret = -EFAULT; break; - - case SO_LINGER: - if(optlen<sizeof(ling)) { - ret = -EINVAL; /* 1003.1g */ - break; - } - if (copy_from_user(&ling,optval,sizeof(ling))) { - ret = -EFAULT; - break; - } - if (!ling.l_onoff) - sock_reset_flag(sk, SOCK_LINGER); - else { + } + if (!ling.l_onoff) + sock_reset_flag(sk, SOCK_LINGER); + else { #if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; - else + if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + else #endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; - sock_set_flag(sk, SOCK_LINGER); - } - break; - - case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("setsockopt"); - break; - - case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); + sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("setsockopt"); + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP: + case SO_TIMESTAMPNS: + if (valbool) { + if (optname == SO_TIMESTAMP) + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); else - clear_bit(SOCK_PASSCRED, &sock->flags); - break; - - case SO_TIMESTAMP: - if (valbool) { - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk); - } else - sock_reset_flag(sk, SOCK_RCVTSTAMP); - break; - - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; - break; - - case SO_RCVTIMEO: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); - break; + sock_set_flag(sk, SOCK_RCVTSTAMPNS); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + } + break; - case SO_SNDTIMEO: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + case SO_TIMESTAMPING: + if (val & ~SOF_TIMESTAMPING_MASK) { + ret = -EINVAL; break; - -#ifdef CONFIG_NETDEVICES - case SO_BINDTODEVICE: - { - char devname[IFNAMSIZ]; - - /* Sorry... */ - if (!capable(CAP_NET_RAW)) { - ret = -EPERM; + } + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, + val & SOF_TIMESTAMPING_TX_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, + val & SOF_TIMESTAMPING_TX_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, + val & SOF_TIMESTAMPING_RX_HARDWARE); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, + val & SOF_TIMESTAMPING_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, + val & SOF_TIMESTAMPING_SYS_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, + val & SOF_TIMESTAMPING_RAW_HARDWARE); + break; + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; + + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + break; + + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + break; + + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) break; - } - /* Bind this socket to a particular device like "eth0", - * as specified in the passed interface name. If the - * name is "" or the option length is zero the socket - * is not bound. - */ - - if (!valbool) { - sk->sk_bound_dev_if = 0; - } else { - if (optlen > IFNAMSIZ) - optlen = IFNAMSIZ; - if (copy_from_user(devname, optval, optlen)) { - ret = -EFAULT; - break; - } + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + ret = sk_detach_filter(sk); + break; + + case SO_LOCK_FILTER: + if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) + ret = -EPERM; + else + sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); + break; + + case SO_PASSSEC: + if (valbool) + set_bit(SOCK_PASSSEC, &sock->flags); + else + clear_bit(SOCK_PASSSEC, &sock->flags); + break; + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + ret = -EPERM; + else + sk->sk_mark = val; + break; - /* Remove any cached route for this socket. */ - sk_dst_reset(sk); - - if (devname[0] == '\0') { - sk->sk_bound_dev_if = 0; - } else { - struct net_device *dev = dev_get_by_name(devname); - if (!dev) { - ret = -ENODEV; - break; - } - sk->sk_bound_dev_if = dev->ifindex; - dev_put(dev); - } - } - break; + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + case SO_RXQ_OVFL: + sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); + break; + + case SO_WIFI_STATUS: + sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); + break; + + case SO_PEEK_OFF: + if (sock->ops->set_peek_off) + ret = sock->ops->set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + break; + + case SO_NOFCS: + sock_valbool_flag(sk, SOCK_NOFCS, valbool); + break; + + case SO_SELECT_ERR_QUEUE: + sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); + break; + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + /* allow unprivileged users to decrease the value */ + if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else { + if (val < 0) + ret = -EINVAL; + else + sk->sk_ll_usec = val; } + break; #endif + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; - case SO_ATTACH_FILTER: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; - - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; - - ret = sk_attach_filter(&fprog, sk); - } - break; - - case SO_DETACH_FILTER: - spin_lock_bh(&sk->sk_lock.slock); - filter = sk->sk_filter; - if (filter) { - sk->sk_filter = NULL; - spin_unlock_bh(&sk->sk_lock.slock); - sk_filter_release(sk, filter); - break; - } - spin_unlock_bh(&sk->sk_lock.slock); - ret = -ENONET; - break; - - /* We implement the SO_SNDLOWAT etc to - not be settable (1003.1g 5.3) */ - default: - ret = -ENOPROTOOPT; - break; - } + default: + ret = -ENOPROTOOPT; + break; + } release_sock(sk); return ret; } +EXPORT_SYMBOL(sock_setsockopt); + +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) +{ + ucred->pid = pid_vnr(pid); + ucred->uid = ucred->gid = -1; + if (cred) { + struct user_namespace *current_ns = current_user_ns(); + + ucred->uid = from_kuid_munged(current_ns, cred->euid); + ucred->gid = from_kgid_munged(current_ns, cred->egid); + } +} int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; - - union - { - int val; - struct linger ling; + + union { + int val; + struct linger ling; struct timeval tm; } v; - - unsigned int lv = sizeof(int); + + int lv = sizeof(int); int len; - - if(get_user(len,optlen)) - return -EFAULT; - if(len < 0) + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) return -EINVAL; - - switch(optname) - { - case SO_DEBUG: - v.val = sock_flag(sk, SOCK_DBG); - break; - - case SO_DONTROUTE: - v.val = sock_flag(sk, SOCK_LOCALROUTE); - break; - - case SO_BROADCAST: - v.val = !!sock_flag(sk, SOCK_BROADCAST); - break; - case SO_SNDBUF: - v.val = sk->sk_sndbuf; - break; - - case SO_RCVBUF: - v.val = sk->sk_rcvbuf; - break; + memset(&v, 0, sizeof(v)); + + switch (optname) { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = sk->sk_sndbuf; + break; + + case SO_RCVBUF: + v.val = sk->sk_rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_REUSEPORT: + v.val = sk->sk_reuseport; + break; + + case SO_KEEPALIVE: + v.val = sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_PROTOCOL: + v.val = sk->sk_protocol; + break; + + case SO_DOMAIN: + v.val = sk->sk_family; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if (v.val == 0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check_tx; + break; + + case SO_PRIORITY: + v.val = sk->sk_priority; + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = sk->sk_lingertime / HZ; + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("getsockopt"); + break; + + case SO_TIMESTAMP: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPNS: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPING: + v.val = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_TX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) + v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + break; + + case SO_RCVTIMEO: + lv = sizeof(struct timeval); + if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_rcvtimeo / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_SNDTIMEO: + lv = sizeof(struct timeval); + if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_sndtimeo / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + } + break; - case SO_REUSEADDR: - v.val = sk->sk_reuse; - break; + case SO_RCVLOWAT: + v.val = sk->sk_rcvlowat; + break; - case SO_KEEPALIVE: - v.val = !!sock_flag(sk, SOCK_KEEPOPEN); - break; + case SO_SNDLOWAT: + v.val = 1; + break; - case SO_TYPE: - v.val = sk->sk_type; - break; + case SO_PASSCRED: + v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + break; - case SO_ERROR: - v.val = -sock_error(sk); - if(v.val==0) - v.val = xchg(&sk->sk_err_soft, 0); - break; + case SO_PEERCRED: + { + struct ucred peercred; + if (len > sizeof(peercred)) + len = sizeof(peercred); + cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + if (copy_to_user(optval, &peercred, len)) + return -EFAULT; + goto lenout; + } - case SO_OOBINLINE: - v.val = !!sock_flag(sk, SOCK_URGINLINE); - break; - - case SO_NO_CHECK: - v.val = sk->sk_no_check; - break; + case SO_PEERNAME: + { + char address[128]; + + if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } - case SO_PRIORITY: - v.val = sk->sk_priority; - break; - - case SO_LINGER: - lv = sizeof(v.ling); - v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; - break; - - case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("getsockopt"); - break; + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; - case SO_TIMESTAMP: - v.val = sock_flag(sk, SOCK_RCVTSTAMP); - break; + case SO_PASSSEC: + v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + break; - case SO_RCVTIMEO: - lv=sizeof(struct timeval); - if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; - } - break; + case SO_PEERSEC: + return security_socket_getpeersec_stream(sock, optval, optlen, len); - case SO_SNDTIMEO: - lv=sizeof(struct timeval); - if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; - } - break; + case SO_MARK: + v.val = sk->sk_mark; + break; - case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; - break; + case SO_RXQ_OVFL: + v.val = sock_flag(sk, SOCK_RXQ_OVFL); + break; - case SO_SNDLOWAT: - v.val=1; - break; + case SO_WIFI_STATUS: + v.val = sock_flag(sk, SOCK_WIFI_STATUS); + break; - case SO_PASSCRED: - v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; - break; + case SO_PEEK_OFF: + if (!sock->ops->set_peek_off) + return -EOPNOTSUPP; - case SO_PEERCRED: - if (len > sizeof(sk->sk_peercred)) - len = sizeof(sk->sk_peercred); - if (copy_to_user(optval, &sk->sk_peercred, len)) - return -EFAULT; - goto lenout; - - case SO_PEERNAME: - { - char address[128]; - - if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) - return -ENOTCONN; - if (lv < len) - return -EINVAL; - if (copy_to_user(optval, address, len)) - return -EFAULT; - goto lenout; - } + v.val = sk->sk_peek_off; + break; + case SO_NOFCS: + v.val = sock_flag(sk, SOCK_NOFCS); + break; - /* Dubious BSD thing... Probably nobody even uses it, but - * the UNIX standard wants it for whatever reason... -DaveM - */ - case SO_ACCEPTCONN: - v.val = sk->sk_state == TCP_LISTEN; - break; + case SO_BINDTODEVICE: + return sock_getbindtodevice(sk, optval, optlen, len); + + case SO_GET_FILTER: + len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + if (len < 0) + return len; - case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + goto lenout; - default: - return(-ENOPROTOOPT); + case SO_LOCK_FILTER: + v.val = sock_flag(sk, SOCK_FILTER_LOCKED); + break; + + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + + case SO_SELECT_ERR_QUEUE: + v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); + break; + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + v.val = sk->sk_ll_usec; + break; +#endif + + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + + default: + return -ENOPROTOOPT; } + if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) - return -EFAULT; - return 0; + if (put_user(len, optlen)) + return -EFAULT; + return 0; } -/** - * sk_alloc - All socket objects are allocated here - * @family: protocol family - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) - * @prot: struct proto associated with this new sock instance - * @zero_it: if we should zero the newly allocated sock +/* + * Initialize an sk_lock. + * + * (We also register the sk_lock with the lock validator.) */ -struct sock *sk_alloc(int family, gfp_t priority, - struct proto *prot, int zero_it) +static inline void sock_lock_init(struct sock *sk) { - struct sock *sk = NULL; - kmem_cache_t *slab = prot->slab; + sock_lock_init_class_and_name(sk, + af_family_slock_key_strings[sk->sk_family], + af_family_slock_keys + sk->sk_family, + af_family_key_strings[sk->sk_family], + af_family_keys + sk->sk_family); +} - if (slab != NULL) - sk = kmem_cache_alloc(slab, priority); - else - sk = kmalloc(prot->obj_size, priority); +/* + * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, + * even temporarly, because of RCU lookups. sk_node should also be left as is. + * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end + */ +static void sock_copy(struct sock *nsk, const struct sock *osk) +{ +#ifdef CONFIG_SECURITY_NETWORK + void *sptr = nsk->sk_security; +#endif + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); - if (sk) { - if (zero_it) { - memset(sk, 0, prot->obj_size); - sk->sk_family = family; - /* - * See comment in struct sock definition to understand - * why we need sk_prot_creator -acme - */ - sk->sk_prot = sk->sk_prot_creator = prot; - sock_lock_init(sk); + memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + +#ifdef CONFIG_SECURITY_NETWORK + nsk->sk_security = sptr; + security_sk_clone(osk, nsk); +#endif +} + +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) +{ + unsigned long nulls1, nulls2; + + nulls1 = offsetof(struct sock, __sk_common.skc_node.next); + nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); + if (nulls1 > nulls2) + swap(nulls1, nulls2); + + if (nulls1 != 0) + memset((char *)sk, 0, nulls1); + memset((char *)sk + nulls1 + sizeof(void *), 0, + nulls2 - nulls1 - sizeof(void *)); + memset((char *)sk + nulls2 + sizeof(void *), 0, + size - nulls2 - sizeof(void *)); +} +EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); + +static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, + int family) +{ + struct sock *sk; + struct kmem_cache *slab; + + slab = prot->slab; + if (slab != NULL) { + sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); + if (!sk) + return sk; + if (priority & __GFP_ZERO) { + if (prot->clear_sk) + prot->clear_sk(sk, prot->obj_size); + else + sk_prot_clear_nulls(sk, prot->obj_size); } - + } else + sk = kmalloc(prot->obj_size, priority); + + if (sk != NULL) { + kmemcheck_annotate_bitfield(sk, flags); + if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) - goto out_free; + goto out_free_sec; + sk_tx_queue_clear(sk); } + return sk; +out_free_sec: + security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); @@ -677,68 +1344,183 @@ out_free: return NULL; } -void sk_free(struct sock *sk) +static void sk_prot_free(struct proto *prot, struct sock *sk) +{ + struct kmem_cache *slab; + struct module *owner; + + owner = prot->owner; + slab = prot->slab; + + security_sk_free(sk); + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + module_put(owner); +} + +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) +void sock_update_netprioidx(struct sock *sk) +{ + if (in_interrupt()) + return; + + sk->sk_cgrp_prioidx = task_netprioidx(current); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx); +#endif + +/** + * sk_alloc - All socket objects are allocated here + * @net: the applicable net namespace + * @family: protocol family + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot: struct proto associated with this new sock instance + */ +struct sock *sk_alloc(struct net *net, int family, gfp_t priority, + struct proto *prot) +{ + struct sock *sk; + + sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); + if (sk) { + sk->sk_family = family; + /* + * See comment in struct sock definition to understand + * why we need sk_prot_creator -acme + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sock_lock_init(sk); + sock_net_set(sk, get_net(net)); + atomic_set(&sk->sk_wmem_alloc, 1); + + sock_update_classid(sk); + sock_update_netprioidx(sk); + } + + return sk; +} +EXPORT_SYMBOL(sk_alloc); + +static void __sk_free(struct sock *sk) { struct sk_filter *filter; - struct module *owner = sk->sk_prot_creator->owner; if (sk->sk_destruct) sk->sk_destruct(sk); - filter = sk->sk_filter; + filter = rcu_dereference_check(sk->sk_filter, + atomic_read(&sk->sk_wmem_alloc) == 0); if (filter) { - sk_filter_release(sk, filter); - sk->sk_filter = NULL; + sk_filter_uncharge(sk, filter); + RCU_INIT_POINTER(sk->sk_filter, NULL); } - sock_disable_timestamp(sk); + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); if (atomic_read(&sk->sk_omem_alloc)) - printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", - __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); + pr_debug("%s: optmem leakage (%d bytes) detected\n", + __func__, atomic_read(&sk->sk_omem_alloc)); - security_sk_free(sk); - if (sk->sk_prot_creator->slab != NULL) - kmem_cache_free(sk->sk_prot_creator->slab, sk); - else - kfree(sk); - module_put(owner); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); + put_net(sock_net(sk)); + sk_prot_free(sk->sk_prot_creator, sk); +} + +void sk_free(struct sock *sk) +{ + /* + * We subtract one from sk_wmem_alloc and can know if + * some packets are still in some tx queue. + * If not null, sock_wfree() will call __sk_free(sk) later + */ + if (atomic_dec_and_test(&sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sk_free); + +/* + * Last sock_put should drop reference to sk->sk_net. It has already + * been dropped in sk_change_net. Taking reference to stopping namespace + * is not an option. + * Take reference to a socket to remove it from hash _alive_ and after that + * destroy it in the context of init_net. + */ +void sk_release_kernel(struct sock *sk) +{ + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_hold(sk); + sock_release(sk->sk_socket); + release_net(sock_net(sk)); + sock_net_set(sk, get_net(&init_net)); + sock_put(sk); +} +EXPORT_SYMBOL(sk_release_kernel); + +static void sk_update_clone(const struct sock *sk, struct sock *newsk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_update_memcg(newsk); } -struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +/** + * sk_clone_lock - clone a socket, and lock its clone + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { - struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); + struct sock *newsk; + newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); if (newsk != NULL) { struct sk_filter *filter; - memcpy(newsk, sk, sk->sk_prot->obj_size); + sock_copy(newsk, sk); /* SANITY */ + get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); - atomic_set(&newsk->sk_wmem_alloc, 0); + /* + * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) + */ + atomic_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif - rwlock_init(&newsk->sk_dst_lock); + spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); + lockdep_set_class_and_name(&newsk->sk_callback_lock, + af_callback_keys + newsk->sk_family, + af_family_clock_key_strings[newsk->sk_family]); newsk->sk_dst_cache = NULL; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_send_head = NULL; - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - filter = newsk->sk_filter; + filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) sk_filter_charge(newsk, filter); @@ -746,6 +1528,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); sk_free(newsk); newsk = NULL; goto out; @@ -753,6 +1536,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); atomic_set(&newsk->sk_refcnt, 2); /* @@ -767,80 +1555,138 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) * to be taken into account in all callers. -acme */ sk_refcnt_debug_inc(newsk); - newsk->sk_socket = NULL; - newsk->sk_sleep = NULL; + sk_set_socket(newsk, NULL); + newsk->sk_wq = NULL; + + sk_update_clone(sk, newsk); if (newsk->sk_prot->sockets_allocated) - atomic_inc(newsk->sk_prot->sockets_allocated); + sk_sockets_allocated_inc(newsk); + + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); } out: return newsk; } - -EXPORT_SYMBOL_GPL(sk_clone); - -void __init sk_init(void) -{ - if (num_physpages <= 4096) { - sysctl_wmem_max = 32767; - sysctl_rmem_max = 32767; - sysctl_wmem_default = 32767; - sysctl_rmem_default = 32767; - } else if (num_physpages >= 131072) { - sysctl_wmem_max = 131071; - sysctl_rmem_max = 131071; +EXPORT_SYMBOL_GPL(sk_clone_lock); + +void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + sk->sk_route_caps &= ~sk->sk_route_nocaps; + if (sk_can_gso(sk)) { + if (dst->header_len) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + } else { + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; + sk->sk_gso_max_size = dst->dev->gso_max_size; + sk->sk_gso_max_segs = dst->dev->gso_max_segs; + } } } +EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ -/* - * Write buffer destructor automatically called from kfree_skb. +/* + * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; + unsigned int len = skb->truesize; - /* In case it might be waiting for more memory. */ - atomic_sub(skb->truesize, &sk->sk_wmem_alloc); - if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + /* + * Keep a reference on sk_wmem_alloc, this will be released + * after sk_write_space() call + */ + atomic_sub(len - 1, &sk->sk_wmem_alloc); sk->sk_write_space(sk); - sock_put(sk); + len = 1; + } + /* + * if sk_wmem_alloc reaches 0, we must finish what sk_free() + * could not do because of in-flight packets + */ + if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sock_wfree); + +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } } +EXPORT_SYMBOL(skb_orphan_partial); -/* - * Read buffer destructor automatically called from kfree_skb. +/* + * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; + unsigned int len = skb->truesize; - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + atomic_sub(len, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, len); } +EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; -int sock_i_uid(struct sock *sk) +#ifdef CONFIG_INET + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else +#endif + sock_put(sk); +} +EXPORT_SYMBOL(sock_edemux); + +kuid_t sock_i_uid(struct sock *sk) { - int uid; + kuid_t uid; - read_lock(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; - read_unlock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; + read_unlock_bh(&sk->sk_callback_lock); return uid; } +EXPORT_SYMBOL(sock_i_uid); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); return ino; } +EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. @@ -849,7 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - struct sk_buff * skb = alloc_skb(size, priority); + struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -857,33 +1703,18 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } return NULL; } +EXPORT_SYMBOL(sock_wmalloc); /* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, - gfp_t priority) -{ - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { - struct sk_buff *skb = alloc_skb(size, priority); - if (skb) { - skb_set_owner_r(skb, sk); - return skb; - } - } - return NULL; -} - -/* * Allocate a memory block from the socket's option memory buffer. - */ + */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned)size <= sysctl_optmem_max && + if ((unsigned int)size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc - * might sleep. + * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); @@ -893,6 +1724,7 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } return NULL; } +EXPORT_SYMBOL(sock_kmalloc); /* * Free an option memory block. @@ -902,11 +1734,12 @@ void sock_kfree_s(struct sock *sk, void *mem, int size) kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } +EXPORT_SYMBOL(sock_kfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ -static long sock_wait_for_wmem(struct sock * sk, long timeo) +static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); @@ -917,7 +1750,7 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) break; if (sk->sk_shutdown & SEND_SHUTDOWN) @@ -926,7 +1759,7 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) break; timeo = schedule_timeout(timeo); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return timeo; } @@ -935,22 +1768,25 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) * Generic send/receive buffer handlers */ -static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, - unsigned long header_len, - unsigned long data_len, - int noblock, int *errcode) +struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + unsigned long data_len, int noblock, + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; + err = -EMSGSIZE; + if (npages > MAX_SKB_FRAGS) + goto failure; timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -959,54 +1795,54 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, sk->sk_allocation); - if (skb) { - int npages; - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - skb_frag_t *frag; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1015,33 +1851,96 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } +EXPORT_SYMBOL(sock_alloc_send_pskb); -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); +} +EXPORT_SYMBOL(sock_alloc_send_skb); + +/* On 32bit arches, an skb frag is limited to 2^15 */ +#define SKB_FRAG_PAGE_ORDER get_order(32768) + +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) +{ + int order; + + if (pfrag->page) { + if (atomic_read(&pfrag->page->_count) == 1) { + pfrag->offset = 0; + return true; + } + if (pfrag->offset + sz <= pfrag->size) + return true; + put_page(pfrag->page); + } + + order = SKB_FRAG_PAGE_ORDER; + do { + gfp_t gfp = prio; + + if (order) + gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + pfrag->page = alloc_pages(gfp, order); + if (likely(pfrag->page)) { + pfrag->offset = 0; + pfrag->size = PAGE_SIZE << order; + return true; + } + } while (--order >= 0); + + return false; } +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; +} +EXPORT_SYMBOL(sk_page_frag_refill); static void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); - for(;;) { + for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); - if(!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } static void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { struct sk_buff *skb = sk->sk_backlog.head; @@ -1052,8 +1951,10 @@ static void __release_sock(struct sock *sk) do { struct sk_buff *next = skb->next; + prefetch(next); + WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); /* * We are in process context here with softirqs @@ -1067,7 +1968,13 @@ static void __release_sock(struct sock *sk) } while (skb != NULL); bh_lock_sock(sk); - } while((skb = sk->sk_backlog.head) != NULL); + } while ((skb = sk->sk_backlog.head) != NULL); + + /* + * Doing the zeroing here guarantee we can not loop forever + * while a wild producer attempts to flood us. + */ + sk->sk_backlog.len = 0; } /** @@ -1085,16 +1992,120 @@ int sk_wait_data(struct sock *sk, long *timeo) int rc; DEFINE_WAIT(wait); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return rc; } - EXPORT_SYMBOL(sk_wait_data); +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + struct proto *prot = sk->sk_prot; + int amt = sk_mem_pages(size); + long allocated; + int parent_status = UNDER_LIMIT; + + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + + allocated = sk_memory_allocated_add(sk, amt, &parent_status); + + /* Under limit. */ + if (parent_status == UNDER_LIMIT && + allocated <= sk_prot_mem_limits(sk, 0)) { + sk_leave_memory_pressure(sk); + return 1; + } + + /* Under pressure. (we or our parents) */ + if ((parent_status > SOFT_LIMIT) || + allocated > sk_prot_mem_limits(sk, 1)) + sk_enter_memory_pressure(sk); + + /* Over hard limit (we or our parents) */ + if ((parent_status == OVER_LIMIT) || + (allocated > sk_prot_mem_limits(sk, 2))) + goto suppress_allocation; + + /* guarantee minimum buffer size under pressure */ + if (kind == SK_MEM_RECV) { + if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + return 1; + + } else { /* SK_MEM_SEND */ + if (sk->sk_type == SOCK_STREAM) { + if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + return 1; + } else if (atomic_read(&sk->sk_wmem_alloc) < + prot->sysctl_wmem[0]) + return 1; + } + + if (sk_has_memory_pressure(sk)) { + int alloc; + + if (!sk_under_memory_pressure(sk)) + return 1; + alloc = sk_sockets_allocated_read_positive(sk); + if (sk_prot_mem_limits(sk, 2) > alloc * + sk_mem_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + } + +suppress_allocation: + + if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + return 1; + } + + trace_sock_exceed_buf_limit(sk, prot, allocated); + + /* Alas. Undo changes. */ + sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; + + sk_memory_allocated_sub(sk, amt); + + return 0; +} +EXPORT_SYMBOL(__sk_mem_schedule); + +/** + * __sk_reclaim - reclaim memory_allocated + * @sk: socket + */ +void __sk_mem_reclaim(struct sock *sk) +{ + sk_memory_allocated_sub(sk, + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); + sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + + if (sk_under_memory_pressure(sk) && + (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) + sk_leave_memory_pressure(sk); +} +EXPORT_SYMBOL(__sk_mem_reclaim); + + /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain @@ -1106,78 +2117,92 @@ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_bind); -int sock_no_connect(struct socket *sock, struct sockaddr *saddr, +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_accept); -int sock_no_getname(struct socket *sock, struct sockaddr *saddr, +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int *len, int peer) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_getname); -unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) { return 0; } +EXPORT_SYMBOL(sock_no_poll); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_shutdown); int sock_no_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_setsockopt); int sock_no_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_getsockopt); int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } +EXPORT_SYMBOL(sock_no_mmap); ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { @@ -1191,6 +2216,7 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz kunmap(page); return res; } +EXPORT_SYMBOL(sock_no_sendpage); /* * Default Socket Callbacks @@ -1198,47 +2224,61 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz static void sock_def_wakeup(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_all(sk->sk_sleep); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk,0,POLL_ERR); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLERR); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk,1,POLL_IN); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | + POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | + POLLWRNORM | POLLWRBAND); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, 2, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_destruct(struct sock *sk) @@ -1250,8 +2290,9 @@ void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) - sk_wake_async(sk, 3, POLL_PRI); + sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } +EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) @@ -1259,15 +2300,13 @@ void sk_reset_timer(struct sock *sk, struct timer_list* timer, if (!mod_timer(timer, expires)) sock_hold(sk); } - EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (timer_pending(timer) && del_timer(timer)) + if (del_timer(timer)) __sock_put(sk); } - EXPORT_SYMBOL(sk_stop_timer); void sock_init_data(struct socket *sock, struct sock *sk) @@ -1275,29 +2314,34 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&sk->sk_async_wait_queue); +#endif sk->sk_send_head = NULL; init_timer(&sk->sk_timer); - + sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; - sk->sk_socket = sock; + sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); - if(sock) - { + if (sock) { sk->sk_type = sock->type; - sk->sk_sleep = &sock->wait; + sk->sk_wq = sock->wq; sock->sk = sk; } else - sk->sk_sleep = NULL; + sk->sk_wq = NULL; - rwlock_init(&sk->sk_dst_lock); + spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; @@ -1305,68 +2349,202 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + sk->sk_peek_off = -1; - sk->sk_peercred.pid = 0; - sk->sk_peercred.uid = -1; - sk->sk_peercred.gid = -1; + sk->sk_peer_pid = NULL; + sk->sk_peer_cred = NULL; sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - sk->sk_stamp.tv_sec = -1L; - sk->sk_stamp.tv_usec = -1L; + sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_RX_BUSY_POLL + sk->sk_napi_id = 0; + sk->sk_ll_usec = sysctl_net_busy_read; +#endif + + sk->sk_max_pacing_rate = ~0U; + sk->sk_pacing_rate = ~0U; + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); atomic_set(&sk->sk_refcnt, 1); + atomic_set(&sk->sk_drops, 0); } +EXPORT_SYMBOL(sock_init_data); -void fastcall lock_sock(struct sock *sk) +void lock_sock_nested(struct sock *sk, int subclass) { might_sleep(); - spin_lock_bh(&(sk->sk_lock.slock)); - if (sk->sk_lock.owner) + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_lock.owned) __lock_sock(sk); - sk->sk_lock.owner = (void *)1; - spin_unlock_bh(&(sk->sk_lock.slock)); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + local_bh_enable(); } +EXPORT_SYMBOL(lock_sock_nested); -EXPORT_SYMBOL(lock_sock); - -void fastcall release_sock(struct sock *sk) +void release_sock(struct sock *sk) { - spin_lock_bh(&(sk->sk_lock.slock)); + /* + * The sk_lock has mutex_unlock() semantics: + */ + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + + spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); - sk->sk_lock.owner = NULL; - if (waitqueue_active(&(sk->sk_lock.wq))) - wake_up(&(sk->sk_lock.wq)); - spin_unlock_bh(&(sk->sk_lock.slock)); + + /* Warning : release_cb() might need to release sk ownership, + * ie call sock_release_ownership(sk) before us. + */ + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + + sock_release_ownership(sk); + if (waitqueue_active(&sk->sk_lock.wq)) + wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); +/** + * lock_sock_fast - fast version of lock_sock + * @sk: socket + * + * This version should be used for very small section, where process wont block + * return false if fast path is taken + * sk_lock.slock locked, owned = 0, BH disabled + * return true if slow path is taken + * sk_lock.slock unlocked, owned = 1, BH enabled + */ +bool lock_sock_fast(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + + if (!sk->sk_lock.owned) + /* + * Note : We must disable BH + */ + return false; + + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + local_bh_enable(); + return true; +} +EXPORT_SYMBOL(lock_sock_fast); + int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) -{ +{ + struct timeval tv; if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk); - if (sk->sk_stamp.tv_sec == -1) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + tv = ktime_to_timeval(sk->sk_stamp); + if (tv.tv_sec == -1) return -ENOENT; - if (sk->sk_stamp.tv_sec == 0) - do_gettimeofday(&sk->sk_stamp); - return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ? - -EFAULT : 0; -} + if (tv.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + tv = ktime_to_timeval(sk->sk_stamp); + } + return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; +} EXPORT_SYMBOL(sock_get_timestamp); -void sock_enable_timestamp(struct sock *sk) -{ - if (!sock_flag(sk, SOCK_TIMESTAMP)) { - sock_set_flag(sk, SOCK_TIMESTAMP); - net_enable_timestamp(); +int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) +{ + struct timespec ts; + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + ts = ktime_to_timespec(sk->sk_stamp); + if (ts.tv_sec == -1) + return -ENOENT; + if (ts.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + ts = ktime_to_timespec(sk->sk_stamp); + } + return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestampns); + +void sock_enable_timestamp(struct sock *sk, int flag) +{ + if (!sock_flag(sk, flag)) { + unsigned long previous_flags = sk->sk_flags; + + sock_set_flag(sk, flag); + /* + * we just set one of the two flags which require net + * time stamping, but time stamping might have been on + * already because of the other one + */ + if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + net_enable_timestamp(); } } -EXPORT_SYMBOL(sock_enable_timestamp); + +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. @@ -1382,7 +2560,6 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL(sock_common_getsockopt); #ifdef CONFIG_COMPAT @@ -1391,7 +2568,7 @@ int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - if (sk->sk_prot->compat_setsockopt != NULL) + if (sk->sk_prot->compat_getsockopt != NULL) return sk->sk_prot->compat_getsockopt(sk, level, optname, optval, optlen); return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); @@ -1412,25 +2589,23 @@ int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = addr_len; return err; } - EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL(sock_common_setsockopt); #ifdef CONFIG_COMPAT int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -1474,96 +2649,194 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); + + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + sock_put(sk); } - EXPORT_SYMBOL(sk_common_release); -static DEFINE_RWLOCK(proto_list_lock); -static LIST_HEAD(proto_list); +#ifdef CONFIG_PROC_FS +#define PROTO_INUSE_NR 64 /* should be enough for the first time */ +struct prot_inuse { + int val[PROTO_INUSE_NR]; +}; -int proto_register(struct proto *prot, int alloc_slab) +static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); + +#ifdef CONFIG_NET_NS +void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +{ + __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + + return res >= 0 ? res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); + +static int __net_init sock_inuse_init_net(struct net *net) +{ + net->core.inuse = alloc_percpu(struct prot_inuse); + return net->core.inuse ? 0 : -ENOMEM; +} + +static void __net_exit sock_inuse_exit_net(struct net *net) +{ + free_percpu(net->core.inuse); +} + +static struct pernet_operations net_inuse_ops = { + .init = sock_inuse_init_net, + .exit = sock_inuse_exit_net, +}; + +static __init int net_inuse_init(void) +{ + if (register_pernet_subsys(&net_inuse_ops)) + panic("Cannot initialize net inuse counters"); + + return 0; +} + +core_initcall(net_inuse_init); +#else +static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); + +void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +{ + __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu(prot_inuse, cpu).val[idx]; + + return res >= 0 ? res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +#endif + +static void assign_proto_idx(struct proto *prot) { - char *request_sock_slab_name = NULL; - char *timewait_sock_slab_name; - int rc = -ENOBUFS; + prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + pr_err("PROTO_INUSE_NR exhausted\n"); + return; + } + + set_bit(prot->inuse_idx, proto_inuse_idx); +} + +static void release_proto_idx(struct proto *prot) +{ + if (prot->inuse_idx != PROTO_INUSE_NR - 1) + clear_bit(prot->inuse_idx, proto_inuse_idx); +} +#else +static inline void assign_proto_idx(struct proto *prot) +{ +} + +static inline void release_proto_idx(struct proto *prot) +{ +} +#endif + +int proto_register(struct proto *prot, int alloc_slab) +{ if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN | prot->slab_flags, + NULL); if (prot->slab == NULL) { - printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", - prot->name); + pr_crit("%s: Can't create sock SLAB cache!\n", + prot->name); goto out; } if (prot->rsk_prot != NULL) { - static const char mask[] = "request_sock_%s"; - - request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); - if (request_sock_slab_name == NULL) + prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); + if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab; - sprintf(request_sock_slab_name, mask, prot->name); - prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, + prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (prot->rsk_prot->slab == NULL) { - printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", - prot->name); + pr_crit("%s: Can't create request sock SLAB cache!\n", + prot->name); goto out_free_request_sock_slab_name; } } if (prot->twsk_prot != NULL) { - static const char mask[] = "tw_sock_%s"; - - timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); - if (timewait_sock_slab_name == NULL) + if (prot->twsk_prot->twsk_slab_name == NULL) goto out_free_request_sock_slab; - sprintf(timewait_sock_slab_name, mask, prot->name); prot->twsk_prot->twsk_slab = - kmem_cache_create(timewait_sock_slab_name, + kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + 0, + SLAB_HWCACHE_ALIGN | + prot->slab_flags, + NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } - write_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); list_add(&prot->node, &proto_list); - write_unlock(&proto_list_lock); - rc = 0; -out: - return rc; + assign_proto_idx(prot); + mutex_unlock(&proto_list_mutex); + return 0; + out_free_timewait_sock_slab_name: - kfree(timewait_sock_slab_name); + kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: if (prot->rsk_prot && prot->rsk_prot->slab) { kmem_cache_destroy(prot->rsk_prot->slab); prot->rsk_prot->slab = NULL; } out_free_request_sock_slab_name: - kfree(request_sock_slab_name); + if (prot->rsk_prot) + kfree(prot->rsk_prot->slab_name); out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; - goto out; +out: + return -ENOBUFS; } - EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { - write_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); + release_proto_idx(prot); list_del(&prot->node); - write_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); if (prot->slab != NULL) { kmem_cache_destroy(prot->slab); @@ -1571,86 +2844,63 @@ void proto_unregister(struct proto *prot) } if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { - const char *name = kmem_cache_name(prot->rsk_prot->slab); - kmem_cache_destroy(prot->rsk_prot->slab); - kfree(name); + kfree(prot->rsk_prot->slab_name); prot->rsk_prot->slab = NULL; } if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_prot->twsk_slab); - kfree(name); + kfree(prot->twsk_prot->twsk_slab_name); prot->twsk_prot->twsk_slab = NULL; } } - EXPORT_SYMBOL(proto_unregister); #ifdef CONFIG_PROC_FS -static inline struct proto *__proto_head(void) -{ - return list_entry(proto_list.next, struct proto, node); -} - -static inline struct proto *proto_head(void) -{ - return list_empty(&proto_list) ? NULL : __proto_head(); -} - -static inline struct proto *proto_next(struct proto *proto) -{ - return proto->node.next == &proto_list ? NULL : - list_entry(proto->node.next, struct proto, node); -} - -static inline struct proto *proto_get_idx(loff_t pos) -{ - struct proto *proto; - loff_t i = 0; - - list_for_each_entry(proto, &proto_list, node) - if (i++ == pos) - goto out; - - proto = NULL; -out: - return proto; -} - static void *proto_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(proto_list_mutex) { - read_lock(&proto_list_lock); - return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN; + mutex_lock(&proto_list_mutex); + return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - ++*pos; - return v == SEQ_START_TOKEN ? proto_head() : proto_next(v); + return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) + __releases(proto_list_mutex) { - read_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } +static long sock_prot_memory_allocated(struct proto *proto) +{ + return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ + return proto->memory_pressure != NULL ? + proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +} static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, - proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1, - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, - proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + sock_prot_inuse_get(seq_file_net(seq), proto), + sock_prot_memory_allocated(proto), + sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), @@ -1677,7 +2927,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) static int proto_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) + if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", @@ -1689,11 +2939,11 @@ static int proto_seq_show(struct seq_file *seq, void *v) "module", "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); else - proto_seq_printf(seq, v); + proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } -static struct seq_operations proto_seq_ops = { +static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, @@ -1702,57 +2952,42 @@ static struct seq_operations proto_seq_ops = { static int proto_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &proto_seq_ops); + return seq_open_net(inode, file, &proto_seq_ops, + sizeof(struct seq_net_private)); } -static struct file_operations proto_seq_fops = { +static const struct file_operations proto_seq_fops = { .owner = THIS_MODULE, .open = proto_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, +}; + +static __net_init int proto_init_net(struct net *net) +{ + if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) + return -ENOMEM; + + return 0; +} + +static __net_exit void proto_exit_net(struct net *net) +{ + remove_proc_entry("protocols", net->proc_net); +} + + +static __net_initdata struct pernet_operations proto_net_ops = { + .init = proto_init_net, + .exit = proto_exit_net, }; static int __init proto_init(void) { - /* register /proc/net/protocols */ - return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0; + return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ - -EXPORT_SYMBOL(sk_alloc); -EXPORT_SYMBOL(sk_free); -EXPORT_SYMBOL(sk_send_sigurg); -EXPORT_SYMBOL(sock_alloc_send_skb); -EXPORT_SYMBOL(sock_init_data); -EXPORT_SYMBOL(sock_kfree_s); -EXPORT_SYMBOL(sock_kmalloc); -EXPORT_SYMBOL(sock_no_accept); -EXPORT_SYMBOL(sock_no_bind); -EXPORT_SYMBOL(sock_no_connect); -EXPORT_SYMBOL(sock_no_getname); -EXPORT_SYMBOL(sock_no_getsockopt); -EXPORT_SYMBOL(sock_no_ioctl); -EXPORT_SYMBOL(sock_no_listen); -EXPORT_SYMBOL(sock_no_mmap); -EXPORT_SYMBOL(sock_no_poll); -EXPORT_SYMBOL(sock_no_recvmsg); -EXPORT_SYMBOL(sock_no_sendmsg); -EXPORT_SYMBOL(sock_no_sendpage); -EXPORT_SYMBOL(sock_no_setsockopt); -EXPORT_SYMBOL(sock_no_shutdown); -EXPORT_SYMBOL(sock_no_socketpair); -EXPORT_SYMBOL(sock_rfree); -EXPORT_SYMBOL(sock_setsockopt); -EXPORT_SYMBOL(sock_wfree); -EXPORT_SYMBOL(sock_wmalloc); -EXPORT_SYMBOL(sock_i_uid); -EXPORT_SYMBOL(sock_i_ino); -EXPORT_SYMBOL(sysctl_optmem_max); -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_rmem_max); -EXPORT_SYMBOL(sysctl_wmem_max); -#endif |
