aboutsummaryrefslogtreecommitdiff
path: root/net/core/sock.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/sock.c')
-rw-r--r--net/core/sock.c976
1 files changed, 709 insertions, 267 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index fb608011146..026e01f7027 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -89,8 +89,11 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/errno.h>
+#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
@@ -111,9 +114,11 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
+#include <linux/static_key.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
@@ -125,13 +130,107 @@
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
#include <linux/filter.h>
+#include <trace/events/sock.h>
+
#ifdef CONFIG_INET
#include <net/tcp.h>
#endif
+#include <net/busy_poll.h>
+
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in the user
+ * namespace @user_ns.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capbility to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in all user
+ * namespaces.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socke was created
+ * and the current process has the capability @cap over the network namespace
+ * the socket is a member of.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
+
+
+#ifdef CONFIG_MEMCG_KMEM
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ struct proto *proto;
+ int ret = 0;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry(proto, &proto_list, node) {
+ if (proto->init_cgroup) {
+ ret = proto->init_cgroup(memcg, ss);
+ if (ret)
+ goto out;
+ }
+ }
+
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+out:
+ list_for_each_entry_continue_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
+{
+ struct proto *proto;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+}
+#endif
+
/*
* Each address family might have different locking rules, so we have
* one slock key per address family:
@@ -139,6 +238,11 @@
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+#if defined(CONFIG_MEMCG_KMEM)
+struct static_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+#endif
+
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
@@ -157,8 +261,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
"sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
- "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" ,
- "sk_lock-AF_MAX"
+ "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
@@ -173,8 +277,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-27" , "slock-28" , "slock-AF_CAN" ,
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
"slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
- "slock-AF_IEEE802154", "slock-AF_CAIF" ,
- "slock-AF_MAX"
+ "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
@@ -189,8 +293,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-27" , "clock-28" , "clock-AF_CAN" ,
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
"clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
- "clock-AF_IEEE802154", "clock-AF_CAIF" ,
- "clock-AF_MAX"
+ "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
};
/*
@@ -205,24 +309,76 @@ static struct lock_class_key af_callback_keys[AF_MAX];
* not depend upon such differences.
*/
#define _SK_MEM_PACKETS 256
-#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
+#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-/* Maximal space eaten by iovec or ancilliary data plus some space */
+/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
-#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
-int net_cls_subsys_id = -1;
-EXPORT_SYMBOL_GPL(net_cls_subsys_id);
-#endif
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_key_slow_inc(&memalloc_socks);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_key_slow_dec(&memalloc_socks);
+
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. However, if SOCK_MEMALLOC is cleared while
+ * it has rmem allocations there is a risk that the user of the
+ * socket cannot make forward progress due to exceeding the rmem
+ * limits. By rights, sk_clear_memalloc() should only be called
+ * on sockets being torn down but warn and reset the accounting if
+ * that assumption breaks.
+ */
+ if (WARN_ON(sk->sk_forward_alloc))
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned long pflags = current->flags;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ current->flags |= PF_MEMALLOC;
+ ret = sk->sk_backlog_rcv(sk, skb);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
@@ -241,9 +397,8 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
*timeo_p = 0;
if (warned < 10 && net_ratelimit()) {
warned++;
- printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
- "tries to set negative timeout\n",
- current->comm, task_pid_nr(current));
+ pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
+ __func__, current->comm, task_pid_nr(current));
}
return 0;
}
@@ -261,20 +416,20 @@ static void sock_warn_obsolete_bsdism(const char *name)
static char warncomm[TASK_COMM_LEN];
if (strcmp(warncomm, current->comm) && warned < 5) {
strcpy(warncomm, current->comm);
- printk(KERN_WARNING "process `%s' is using obsolete "
- "%s SO_BSDCOMPAT\n", warncomm, name);
+ pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
+ warncomm, name);
warned++;
}
}
-static void sock_disable_timestamp(struct sock *sk, int flag)
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
- if (sock_flag(sk, flag)) {
- sock_reset_flag(sk, flag);
- if (!sock_flag(sk, SOCK_TIMESTAMP) &&
- !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
net_disable_timestamp();
- }
}
}
@@ -286,12 +441,9 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
+ trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
@@ -299,7 +451,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (err)
return err;
- if (!sk_rmem_schedule(sk, skb->truesize)) {
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
}
@@ -325,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
@@ -339,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, skb)) {
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
}
@@ -356,7 +508,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
rc = sk_backlog_rcv(sk, skb);
mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else if (sk_add_backlog(sk, skb)) {
+ } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
@@ -372,19 +524,13 @@ discard_and_relse:
}
EXPORT_SYMBOL(sk_receive_skb);
-void sk_reset_txq(struct sock *sk)
-{
- sk_tx_queue_clear(sk);
-}
-EXPORT_SYMBOL(sk_reset_txq);
-
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- rcu_assign_pointer(sk->sk_dst_cache, NULL);
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
}
@@ -407,7 +553,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+ int optlen)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -417,7 +564,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
/* Sorry... */
ret = -EPERM;
- if (!capable(CAP_NET_RAW))
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
@@ -464,6 +611,46 @@ out:
return ret;
}
+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+ int __user *optlen, int len)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+
+ if (sk->sk_bound_dev_if == 0) {
+ len = 0;
+ goto zero;
+ }
+
+ ret = -EINVAL;
+ if (len < IFNAMSIZ)
+ goto out;
+
+ ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ if (ret)
+ goto out;
+
+ len = strlen(devname) + 1;
+
+ ret = -EFAULT;
+ if (copy_to_user(optval, devname, len))
+ goto out;
+
+zero:
+ ret = -EFAULT;
+ if (put_user(len, optlen))
+ goto out;
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
if (valbool)
@@ -491,7 +678,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
*/
if (optname == SO_BINDTODEVICE)
- return sock_bindtodevice(sk, optval, optlen);
+ return sock_setbindtodevice(sk, optval, optlen);
if (optlen < sizeof(int))
return -EINVAL;
@@ -511,7 +698,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_valbool_flag(sk, SOCK_DBG, valbool);
break;
case SO_REUSEADDR:
- sk->sk_reuse = valbool;
+ sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ break;
+ case SO_REUSEPORT:
+ sk->sk_reuseport = valbool;
break;
case SO_TYPE:
case SO_PROTOCOL:
@@ -527,23 +717,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
break;
case SO_SNDBUF:
/* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_wmem_max)
- val = sysctl_wmem_max;
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- if ((val * 2) < SOCK_MIN_SNDBUF)
- sk->sk_sndbuf = SOCK_MIN_SNDBUF;
- else
- sk->sk_sndbuf = val * 2;
-
- /*
- * Wake up sending tasks if we
- * upped the value.
- */
+ sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ /* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -556,12 +738,11 @@ set_sndbuf:
case SO_RCVBUF:
/* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_rmem_max)
- val = sysctl_rmem_max;
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
/*
@@ -579,10 +760,7 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- if ((val * 2) < SOCK_MIN_RCVBUF)
- sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
- else
- sk->sk_rcvbuf = val * 2;
+ sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
break;
case SO_RCVBUFFORCE:
@@ -594,7 +772,8 @@ set_rcvbuf:
case SO_KEEPALIVE:
#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP)
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
tcp_set_keepalive(sk, valbool);
#endif
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
@@ -605,11 +784,12 @@ set_rcvbuf:
break;
case SO_NO_CHECK:
- sk->sk_no_check = valbool;
+ sk->sk_no_check_tx = valbool;
break;
case SO_PRIORITY:
- if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+ if ((val >= 0 && val <= 6) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -679,7 +859,7 @@ set_rcvbuf:
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
val & SOF_TIMESTAMPING_SOFTWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
@@ -719,6 +899,13 @@ set_rcvbuf:
ret = sk_detach_filter(sk);
break;
+ case SO_LOCK_FILTER:
+ if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+ ret = -EPERM;
+ else
+ sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+ break;
+
case SO_PASSSEC:
if (valbool)
set_bit(SOCK_PASSSEC, &sock->flags);
@@ -726,7 +913,7 @@ set_rcvbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
ret = -EPERM;
else
sk->sk_mark = val;
@@ -735,11 +922,48 @@ set_rcvbuf:
/* We implement the SO_SNDLOWAT etc to
not be settable (1003.1g 5.3) */
case SO_RXQ_OVFL:
- if (valbool)
- sock_set_flag(sk, SOCK_RXQ_OVFL);
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
+ break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
+ case SO_PEEK_OFF:
+ if (sock->ops->set_peek_off)
+ ret = sock->ops->set_peek_off(sk, val);
else
- sock_reset_flag(sk, SOCK_RXQ_OVFL);
+ ret = -EOPNOTSUPP;
break;
+
+ case SO_NOFCS:
+ sock_valbool_flag(sk, SOCK_NOFCS, valbool);
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ /* allow unprivileged users to decrease the value */
+ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else {
+ if (val < 0)
+ ret = -EINVAL;
+ else
+ sk->sk_ll_usec = val;
+ }
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -750,19 +974,18 @@ set_rcvbuf:
EXPORT_SYMBOL(sock_setsockopt);
-void cred_to_ucred(struct pid *pid, const struct cred *cred,
- struct ucred *ucred)
+static void cred_to_ucred(struct pid *pid, const struct cred *cred,
+ struct ucred *ucred)
{
ucred->pid = pid_vnr(pid);
ucred->uid = ucred->gid = -1;
if (cred) {
struct user_namespace *current_ns = current_user_ns();
- ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
- ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
}
}
-EXPORT_SYMBOL_GPL(cred_to_ucred);
int sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
@@ -795,7 +1018,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BROADCAST:
- v.val = !!sock_flag(sk, SOCK_BROADCAST);
+ v.val = sock_flag(sk, SOCK_BROADCAST);
break;
case SO_SNDBUF:
@@ -810,8 +1033,12 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reuse;
break;
+ case SO_REUSEPORT:
+ v.val = sk->sk_reuseport;
+ break;
+
case SO_KEEPALIVE:
- v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
+ v.val = sock_flag(sk, SOCK_KEEPOPEN);
break;
case SO_TYPE:
@@ -833,11 +1060,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_OOBINLINE:
- v.val = !!sock_flag(sk, SOCK_URGINLINE);
+ v.val = sock_flag(sk, SOCK_URGINLINE);
break;
case SO_NO_CHECK:
- v.val = sk->sk_no_check;
+ v.val = sk->sk_no_check_tx;
break;
case SO_PRIORITY:
@@ -846,7 +1073,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_LINGER:
lv = sizeof(v.ling);
- v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
+ v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
v.ling.l_linger = sk->sk_lingertime / HZ;
break;
@@ -912,7 +1139,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PASSCRED:
- v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
+ v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
break;
case SO_PEERCRED:
@@ -947,7 +1174,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PASSSEC:
- v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
+ v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_PEERSEC:
@@ -958,7 +1185,53 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_RXQ_OVFL:
- v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
+ v.val = sock_flag(sk, SOCK_RXQ_OVFL);
+ break;
+
+ case SO_WIFI_STATUS:
+ v.val = sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
+ case SO_PEEK_OFF:
+ if (!sock->ops->set_peek_off)
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_peek_off;
+ break;
+ case SO_NOFCS:
+ v.val = sock_flag(sk, SOCK_NOFCS);
+ break;
+
+ case SO_BINDTODEVICE:
+ return sock_getbindtodevice(sk, optval, optlen, len);
+
+ case SO_GET_FILTER:
+ len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+
+ case SO_LOCK_FILTER:
+ v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+ break;
+
+ case SO_BPF_EXTENSIONS:
+ v.val = bpf_tell_extensions();
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ v.val = sk->sk_ll_usec;
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ v.val = sk->sk_max_pacing_rate;
break;
default:
@@ -992,23 +1265,42 @@ static inline void sock_lock_init(struct sock *sk)
/*
* Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
* even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
- BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
- sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
- sizeof(osk->sk_tx_queue_mapping));
- memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+ memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
security_sk_clone(osk, nsk);
#endif
}
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+ unsigned long nulls1, nulls2;
+
+ nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+ nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+ if (nulls1 > nulls2)
+ swap(nulls1, nulls2);
+
+ if (nulls1 != 0)
+ memset((char *)sk, 0, nulls1);
+ memset((char *)sk + nulls1 + sizeof(void *), 0,
+ nulls2 - nulls1 - sizeof(void *));
+ memset((char *)sk + nulls2 + sizeof(void *), 0,
+ size - nulls2 - sizeof(void *));
+}
+EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
@@ -1021,19 +1313,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!sk)
return sk;
if (priority & __GFP_ZERO) {
- /*
- * caches using SLAB_DESTROY_BY_RCU should let
- * sk_node.next un-modified. Special care is taken
- * when initializing object to zero.
- */
- if (offsetof(struct sock, sk_node.next) != 0)
- memset(sk, 0, offsetof(struct sock, sk_node.next));
- memset(&sk->sk_node.pprev, 0,
- prot->obj_size - offsetof(struct sock,
- sk_node.pprev));
+ if (prot->clear_sk)
+ prot->clear_sk(sk, prot->obj_size);
+ else
+ sk_prot_clear_nulls(sk, prot->obj_size);
}
- }
- else
+ } else
sk = kmalloc(prot->obj_size, priority);
if (sk != NULL) {
@@ -1075,18 +1360,15 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
module_put(owner);
}
-#ifdef CONFIG_CGROUPS
-void sock_update_classid(struct sock *sk)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+void sock_update_netprioidx(struct sock *sk)
{
- u32 classid;
+ if (in_interrupt())
+ return;
- rcu_read_lock(); /* doing current task, which cannot vanish. */
- classid = task_cls_classid(current);
- rcu_read_unlock();
- if (classid && classid != sk->sk_classid)
- sk->sk_classid = classid;
+ sk->sk_cgrp_prioidx = task_netprioidx(current);
}
-EXPORT_SYMBOL(sock_update_classid);
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
/**
@@ -1114,6 +1396,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
+ sock_update_netprioidx(sk);
}
return sk;
@@ -1131,15 +1414,14 @@ static void __sk_free(struct sock *sk)
atomic_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
- rcu_assign_pointer(sk->sk_filter, NULL);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk, SOCK_TIMESTAMP);
- sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
if (atomic_read(&sk->sk_omem_alloc))
- printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
- __func__, atomic_read(&sk->sk_omem_alloc));
+ pr_debug("%s: optmem leakage (%d bytes) detected\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
@@ -1151,7 +1433,7 @@ static void __sk_free(struct sock *sk)
void sk_free(struct sock *sk)
{
/*
- * We substract one from sk_wmem_alloc and can know if
+ * We subtract one from sk_wmem_alloc and can know if
* some packets are still in some tx queue.
* If not null, sock_wfree() will call __sk_free(sk) later
*/
@@ -1161,10 +1443,10 @@ void sk_free(struct sock *sk)
EXPORT_SYMBOL(sk_free);
/*
- * Last sock_put should drop referrence to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking referrence to stopping namespace
+ * Last sock_put should drop reference to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking reference to stopping namespace
* is not an option.
- * Take referrence to a socket to remove it from hash _alive_ and after that
+ * Take reference to a socket to remove it from hash _alive_ and after that
* destroy it in the context of init_net.
*/
void sk_release_kernel(struct sock *sk)
@@ -1180,7 +1462,20 @@ void sk_release_kernel(struct sock *sk)
}
EXPORT_SYMBOL(sk_release_kernel);
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+static void sk_update_clone(const struct sock *sk, struct sock *newsk)
+{
+ if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+ sock_update_memcg(newsk);
+}
+
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
@@ -1233,6 +1528,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
sk_free(newsk);
newsk = NULL;
goto out;
@@ -1262,17 +1558,18 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
+ sk_update_clone(sk, newsk);
+
if (newsk->sk_prot->sockets_allocated)
- percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+ sk_sockets_allocated_inc(newsk);
- if (sock_flag(newsk, SOCK_TIMESTAMP) ||
- sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
}
out:
return newsk;
}
-EXPORT_SYMBOL_GPL(sk_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
@@ -1287,24 +1584,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
+ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
}
}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
-void __init sk_init(void)
-{
- if (totalram_pages <= 4096) {
- sysctl_wmem_max = 32767;
- sysctl_rmem_max = 32767;
- sysctl_wmem_default = 32767;
- sysctl_rmem_default = 32767;
- } else if (totalram_pages >= 131072) {
- sysctl_wmem_max = 131071;
- sysctl_rmem_max = 131071;
- }
-}
-
/*
* Simple resource managers for sockets.
*/
@@ -1336,6 +1621,25 @@ void sock_wfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_wfree);
+void skb_orphan_partial(struct sk_buff *skb)
+{
+ /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
+ * so we do not completely orphan skb, but transfert all
+ * accounted bytes but one, to avoid unexpected reorders.
+ */
+ if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+ || skb->destructor == tcp_wfree
+#endif
+ ) {
+ atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
+ skb->truesize = 1;
+ } else {
+ skb_orphan(skb);
+ }
+}
+EXPORT_SYMBOL(skb_orphan_partial);
+
/*
* Read buffer destructor automatically called from kfree_skb.
*/
@@ -1349,13 +1653,25 @@ void sock_rfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_rfree);
+void sock_edemux(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
-int sock_i_uid(struct sock *sk)
+#ifdef CONFIG_INET
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put(inet_twsk(sk));
+ else
+#endif
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
+kuid_t sock_i_uid(struct sock *sk)
{
- int uid;
+ kuid_t uid;
read_lock_bh(&sk->sk_callback_lock);
- uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
+ uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
read_unlock_bh(&sk->sk_callback_lock);
return uid;
}
@@ -1390,27 +1706,11 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
EXPORT_SYMBOL(sock_wmalloc);
/*
- * Allocate a skb from the socket's receive buffer.
- */
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
- gfp_t priority)
-{
- if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
- struct sk_buff *skb = alloc_skb(size, priority);
- if (skb) {
- skb_set_owner_r(skb, sk);
- return skb;
- }
- }
- return NULL;
-}
-
-/*
* Allocate a memory block from the socket's option memory buffer.
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned)size <= sysctl_optmem_max &&
+ if ((unsigned int)size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
@@ -1470,19 +1770,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
- int *errcode)
+ int *errcode, int max_page_order)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ unsigned long chunk;
gfp_t gfp_mask;
long timeo;
int err;
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ struct page *page;
+ int i;
- gfp_mask = sk->sk_allocation;
- if (gfp_mask & __GFP_WAIT)
- gfp_mask |= __GFP_REPEAT;
+ err = -EMSGSIZE;
+ if (npages > MAX_SKB_FRAGS)
+ goto failure;
timeo = sock_sndtimeo(sk, noblock);
- while (1) {
+ while (!skb) {
err = sock_error(sk);
if (err != 0)
goto failure;
@@ -1491,54 +1795,54 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- skb = alloc_skb(header_len, gfp_mask);
- if (skb) {
- int npages;
- int i;
-
- /* No pages, we're done... */
- if (!data_len)
- break;
-
- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- skb->truesize += data_len;
- skb_shinfo(skb)->nr_frags = npages;
- for (i = 0; i < npages; i++) {
- struct page *page;
- skb_frag_t *frag;
-
- page = alloc_pages(sk->sk_allocation, 0);
- if (!page) {
- err = -ENOBUFS;
- skb_shinfo(skb)->nr_frags = i;
- kfree_skb(skb);
- goto failure;
- }
-
- frag = &skb_shinfo(skb)->frags[i];
- frag->page = page;
- frag->page_offset = 0;
- frag->size = (data_len >= PAGE_SIZE ?
- PAGE_SIZE :
- data_len);
- data_len -= PAGE_SIZE;
- }
+ if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
+ goto failure;
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
+ continue;
+ }
- /* Full success... */
- break;
- }
- err = -ENOBUFS;
+ err = -ENOBUFS;
+ gfp_mask = sk->sk_allocation;
+ if (gfp_mask & __GFP_WAIT)
+ gfp_mask |= __GFP_REPEAT;
+
+ skb = alloc_skb(header_len, gfp_mask);
+ if (!skb)
goto failure;
+
+ skb->truesize += data_len;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages(sk->sk_allocation |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ }
+ order--;
+ }
+ page = alloc_page(sk->sk_allocation);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
}
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = -EAGAIN;
- if (!timeo)
- goto failure;
- if (signal_pending(current))
- goto interrupted;
- timeo = sock_wait_for_wmem(sk, timeo);
}
skb_set_owner_w(skb, sk);
@@ -1547,6 +1851,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
interrupted:
err = sock_intr_errno(timeo);
failure:
+ kfree_skb(skb);
*errcode = err;
return NULL;
}
@@ -1555,10 +1860,66 @@ EXPORT_SYMBOL(sock_alloc_send_pskb);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
int noblock, int *errcode)
{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+
+/**
+ * skb_page_frag_refill - check that a page_frag contains enough room
+ * @sz: minimum size of the fragment we want to get
+ * @pfrag: pointer to page_frag
+ * @prio: priority for memory allocation
+ *
+ * Note: While this allocator tries to use high order pages, there is
+ * no guarantee that allocations succeed. Therefore, @sz MUST be
+ * less or equal than PAGE_SIZE.
+ */
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+{
+ int order;
+
+ if (pfrag->page) {
+ if (atomic_read(&pfrag->page->_count) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ order = SKB_FRAG_PAGE_ORDER;
+ do {
+ gfp_t gfp = prio;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+ pfrag->page = alloc_pages(gfp, order);
+ if (likely(pfrag->page)) {
+ pfrag->offset = 0;
+ pfrag->size = PAGE_SIZE << order;
+ return true;
+ }
+ } while (--order >= 0);
+
+ return false;
+}
+EXPORT_SYMBOL(skb_page_frag_refill);
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+ return true;
+
+ sk_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
@@ -1590,6 +1951,7 @@ static void __release_sock(struct sock *sk)
do {
struct sk_buff *next = skb->next;
+ prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
@@ -1654,30 +2016,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
struct proto *prot = sk->sk_prot;
int amt = sk_mem_pages(size);
long allocated;
+ int parent_status = UNDER_LIMIT;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+ allocated = sk_memory_allocated_add(sk, amt, &parent_status);
/* Under limit. */
- if (allocated <= prot->sysctl_mem[0]) {
- if (prot->memory_pressure && *prot->memory_pressure)
- *prot->memory_pressure = 0;
+ if (parent_status == UNDER_LIMIT &&
+ allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
return 1;
}
- /* Under pressure. */
- if (allocated > prot->sysctl_mem[1])
- if (prot->enter_memory_pressure)
- prot->enter_memory_pressure(sk);
+ /* Under pressure. (we or our parents) */
+ if ((parent_status > SOFT_LIMIT) ||
+ allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
- /* Over hard limit. */
- if (allocated > prot->sysctl_mem[2])
+ /* Over hard limit (we or our parents) */
+ if ((parent_status == OVER_LIMIT) ||
+ (allocated > sk_prot_mem_limits(sk, 2)))
goto suppress_allocation;
/* guarantee minimum buffer size under pressure */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
return 1;
+
} else { /* SK_MEM_SEND */
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1687,13 +2053,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
return 1;
}
- if (prot->memory_pressure) {
+ if (sk_has_memory_pressure(sk)) {
int alloc;
- if (!*prot->memory_pressure)
+ if (!sk_under_memory_pressure(sk))
return 1;
- alloc = percpu_counter_read_positive(prot->sockets_allocated);
- if (prot->sysctl_mem[2] > alloc *
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
atomic_read(&sk->sk_rmem_alloc) +
sk->sk_forward_alloc))
@@ -1712,9 +2078,13 @@ suppress_allocation:
return 1;
}
+ trace_sock_exceed_buf_limit(sk, prot, allocated);
+
/* Alas. Undo changes. */
sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
- atomic_long_sub(amt, prot->memory_allocated);
+
+ sk_memory_allocated_sub(sk, amt);
+
return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1725,15 +2095,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reclaim(struct sock *sk)
{
- struct proto *prot = sk->sk_prot;
-
- atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
- prot->memory_allocated);
+ sk_memory_allocated_sub(sk,
+ sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
- if (prot->memory_pressure && *prot->memory_pressure &&
- (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
- *prot->memory_pressure = 0;
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -1877,14 +2245,14 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
- wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
@@ -1936,7 +2304,7 @@ EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (timer_pending(timer) && del_timer(timer))
+ if (del_timer(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
@@ -1981,8 +2349,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ sk->sk_peek_off = -1;
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
@@ -1993,6 +2362,13 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_stamp = ktime_set(-1L, 0);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ sk->sk_napi_id = 0;
+ sk->sk_ll_usec = sysctl_net_busy_read;
+#endif
+
+ sk->sk_max_pacing_rate = ~0U;
+ sk->sk_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -2029,7 +2405,14 @@ void release_sock(struct sock *sk)
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
- sk->sk_lock.owned = 0;
+
+ /* Warning : release_cb() might need to release sk ownership,
+ * ie call sock_release_ownership(sk) before us.
+ */
+ if (sk->sk_prot->release_cb)
+ sk->sk_prot->release_cb(sk);
+
+ sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
wake_up(&sk->sk_lock.wq);
spin_unlock_bh(&sk->sk_lock.slock);
@@ -2104,20 +2487,65 @@ EXPORT_SYMBOL(sock_get_timestampns);
void sock_enable_timestamp(struct sock *sk, int flag)
{
if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
sock_set_flag(sk, flag);
/*
* we just set one of the two flags which require net
* time stamping, but time stamping might have been on
* already because of the other one
*/
- if (!sock_flag(sk,
- flag == SOCK_TIMESTAMP ?
- SOCK_TIMESTAMPING_RX_SOFTWARE :
- SOCK_TIMESTAMP))
+ if (!(previous_flags & SK_FLAGS_TIMESTAMP))
net_enable_timestamp();
}
}
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+ int level, int type)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb, *skb2;
+ int copied, err;
+
+ err = -EAGAIN;
+ skb = skb_dequeue(&sk->sk_error_queue);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+ put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+ /* Reset and regenerate socket error */
+ spin_lock_bh(&sk->sk_error_queue.lock);
+ sk->sk_err = 0;
+ if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+ sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+ spin_unlock_bh(&sk->sk_error_queue.lock);
+ sk->sk_error_report(sk);
+ } else
+ spin_unlock_bh(&sk->sk_error_queue.lock);
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
+
/*
* Get a socket option on an socket.
*
@@ -2221,13 +2649,16 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR 64 /* should be enough for the first time */
struct prot_inuse {
@@ -2307,7 +2738,7 @@ static void assign_proto_idx(struct proto *prot)
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
- printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+ pr_err("PROTO_INUSE_NR exhausted\n");
return;
}
@@ -2337,8 +2768,8 @@ int proto_register(struct proto *prot, int alloc_slab)
NULL);
if (prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
- prot->name);
+ pr_crit("%s: Can't create sock SLAB cache!\n",
+ prot->name);
goto out;
}
@@ -2352,8 +2783,8 @@ int proto_register(struct proto *prot, int alloc_slab)
SLAB_HWCACHE_ALIGN, NULL);
if (prot->rsk_prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
- prot->name);
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
goto out_free_request_sock_slab_name;
}
}
@@ -2376,10 +2807,10 @@ int proto_register(struct proto *prot, int alloc_slab)
}
}
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list);
assign_proto_idx(prot);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
return 0;
out_free_timewait_sock_slab_name:
@@ -2402,10 +2833,10 @@ EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
release_proto_idx(prot);
list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
@@ -2428,9 +2859,9 @@ EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(proto_list_lock)
+ __acquires(proto_list_mutex)
{
- read_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
return seq_list_start_head(&proto_list, *pos);
}
@@ -2440,25 +2871,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void proto_seq_stop(struct seq_file *seq, void *v)
- __releases(proto_list_lock)
+ __releases(proto_list_mutex)
{
- read_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
return method == NULL ? 'n' : 'y';
}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
+
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto),
- proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
- proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
proto->max_header,
proto->slab == NULL ? "no" : "yes",
module_name(proto->owner),
@@ -2524,7 +2966,7 @@ static const struct file_operations proto_seq_fops = {
static __net_init int proto_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
+ if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
return -ENOMEM;
return 0;
@@ -2532,7 +2974,7 @@ static __net_init int proto_init_net(struct net *net)
static __net_exit void proto_exit_net(struct net *net)
{
- proc_net_remove(net, "protocols");
+ remove_proc_entry("protocols", net->proc_net);
}