aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/af_inet.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/af_inet.c')
-rw-r--r--net/ipv4/af_inet.c653
1 files changed, 377 insertions, 276 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b61107df6..d156b3c5f36 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -65,6 +65,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -89,7 +91,6 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/inet.h>
#include <linux/igmp.h>
@@ -105,14 +106,15 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
-#include <net/ipip.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
+#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
@@ -124,9 +126,6 @@
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);
-struct ipv4_config ipv4_config;
-EXPORT_SYMBOL(ipv4_config);
-
/* New destruction routine */
void inet_sock_destruct(struct sock *sk)
@@ -153,8 +152,9 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON(sk->sk_wmem_queued);
WARN_ON(sk->sk_forward_alloc);
- kfree(inet->opt);
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -209,6 +209,26 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
+ /* Check special setups for testing purpose to enable TFO w/o
+ * requiring TCP_FASTOPEN sockopt.
+ * Note that only TCP sockets (SOCK_STREAM) will reach here.
+ * Also fastopenq may already been allocated because this
+ * socket was in TCP_LISTEN state previously but was
+ * shutdown() (rather than close()).
+ */
+ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+ inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+ if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+ err = fastopen_init_queue(sk, backlog);
+ else if ((sysctl_tcp_fastopen &
+ TFO_SERVER_WO_SOCKOPT2) != 0)
+ err = fastopen_init_queue(sk,
+ ((uint)sysctl_tcp_fastopen) >> 16);
+ else
+ err = 0;
+ if (err)
+ goto out;
+ }
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
@@ -222,41 +242,6 @@ out:
}
EXPORT_SYMBOL(inet_listen);
-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once
- */
-void build_ehash_secret(void)
-{
- u32 rnd;
-
- do {
- get_random_bytes(&rnd, sizeof(rnd));
- } while (rnd == 0);
-
- cmpxchg(&inet_ehash_secret, 0, rnd);
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
-static inline int inet_netns_ok(struct net *net, int protocol)
-{
- int hash;
- const struct net_protocol *ipprot;
-
- if (net_eq(net, &init_net))
- return 1;
-
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
-
- if (ipprot == NULL)
- /* raw IP is OK */
- return 1;
- return ipprot->netns_ok;
-}
-
/*
* Create an inet socket.
*/
@@ -269,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
- if (unlikely(!inet_ehash_secret))
- if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
- build_ehash_secret();
-
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -325,16 +305,12 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
- goto out_rcu_unlock;
-
- err = -EAFNOSUPPORT;
- if (!inet_netns_ok(net, protocol))
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
@@ -346,9 +322,8 @@ lookup_protocol:
goto out;
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -361,7 +336,7 @@ lookup_protocol:
inet->hdrincl = 1;
}
- if (ipv4_config.no_pmtu_disc)
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
@@ -380,6 +355,7 @@ lookup_protocol:
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
+ inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
@@ -451,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
int err;
@@ -464,7 +441,17 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+ }
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -484,7 +471,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
@@ -528,7 +516,7 @@ out:
}
EXPORT_SYMBOL(inet_bind);
-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -540,15 +528,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending += writebias;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -564,6 +553,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
return timeo;
}
@@ -571,8 +561,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
@@ -581,8 +571,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
- lock_sock(sk);
-
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -622,8 +610,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
/* Error code is set above */
- if (!timeo || !inet_wait_for_connect(sk, timeo))
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;
err = sock_intr_errno(timeo);
@@ -645,7 +637,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
err = 0;
out:
- release_sock(sk);
return err;
sock_error:
@@ -655,6 +646,18 @@ sock_error:
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
EXPORT_SYMBOL(inet_stream_connect);
/*
@@ -672,8 +675,10 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
lock_sock(sk2);
+ sock_rps_record_flow(sk2);
WARN_ON(!((1 << sk2->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE)));
sock_graft(sk2, newsock);
@@ -880,6 +885,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
EXPORT_SYMBOL(inet_ioctl);
+#ifdef CONFIG_COMPAT
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD;
+
+ if (sk->sk_prot->compat_ioctl)
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
+
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -903,6 +921,7 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
@@ -929,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
@@ -959,6 +979,7 @@ static const struct proto_ops inet_sockraw_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
@@ -978,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
@@ -988,17 +1008,22 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+ },
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
@@ -1048,13 +1073,11 @@ out:
return;
out_permanent:
- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
- protocol);
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
- printk(KERN_ERR
- "Ignoring attempt to register invalid socket type %d.\n",
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
@@ -1063,8 +1086,7 @@ EXPORT_SYMBOL(inet_register_protosw);
void inet_unregister_protosw(struct inet_protosw *p)
{
if (INET_PROTOSW_PERMANENT & p->flags) {
- printk(KERN_ERR
- "Attempt to unregister permanent protocol %d.\n",
+ pr_err("Attempt to unregister permanent protocol %d\n",
p->protocol);
} else {
spin_lock_bh(&inetsw_lock);
@@ -1085,34 +1107,36 @@ int sysctl_ip_dynaddr __read_mostly;
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
- int err;
- struct rtable *rt;
__be32 old_saddr = inet->inet_saddr;
- __be32 new_saddr;
__be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 new_saddr;
+ struct ip_options_rcu *inet_opt;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
/* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- sk->sk_protocol,
- inet->inet_sport, inet->inet_dport, sk, 0);
- if (err)
- return err;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
sk_setup_caps(sk, &rt->dst);
- new_saddr = rt->rt_src;
+ new_saddr = fl4->saddr;
if (new_saddr == old_saddr)
return 0;
if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
- __func__, &old_saddr, &new_saddr);
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
}
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
@@ -1134,6 +1158,8 @@ int inet_sk_rebuild_header(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
__be32 daddr;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
int err;
/* Route is OK, nothing to do. */
@@ -1141,28 +1167,23 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
daddr = inet->inet_daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-{
- struct flowi fl = {
- .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = inet->inet_saddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = inet->inet_dport,
- };
-
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
- if (!err)
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rcu_read_unlock();
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
sk_setup_caps(sk, &rt->dst);
- else {
+ } else {
+ err = PTR_ERR(rt);
+
/* Routing failed... */
sk->sk_route_caps = 0;
/*
@@ -1182,8 +1203,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
- struct iphdr *iph;
- const struct net_protocol *ops;
+ const struct net_offload *ops;
+ const struct iphdr *iph;
int proto;
int ihl;
int err = -EINVAL;
@@ -1196,46 +1217,55 @@ static int inet_gso_send_check(struct sk_buff *skb)
if (ihl < sizeof(*iph))
goto out;
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
err = -EPROTONOSUPPORT;
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_send_check))
- err = ops->gso_send_check(skb);
- rcu_read_unlock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_send_check))
+ err = ops->callbacks.gso_send_check(skb);
out:
return err;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ unsigned int offset = 0;
+ bool udpfrag, encap;
struct iphdr *iph;
- const struct net_protocol *ops;
int proto;
+ int nhoff;
int ihl;
int id;
- unsigned int offset = 0;
-
- if (!(features & NETIF_F_V4_CSUM))
- features &= ~NETIF_F_SG;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_MPLS |
0)))
goto out;
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
@@ -1244,39 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
if (ihl < sizeof(*iph))
goto out;
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ SKB_GSO_CB(skb)->encap_level += ihl;
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- id = ntohs(iph->id);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
segs = ERR_PTR(-EPROTONOSUPPORT);
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_segment))
- segs = ops->gso_segment(skb, features);
- rcu_read_unlock();
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
- if (!segs || IS_ERR(segs))
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
- iph = ip_hdr(skb);
- if (proto == IPPROTO_UDP) {
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
- offset += (skb->len - skb->mac_len - iph->ihl * 4);
- } else
+ offset += skb->len - nhoff - ihl;
+ } else {
iph->id = htons(id++);
- iph->tot_len = htons(skb->len - skb->mac_len);
- iph->check = 0;
- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+ }
+ iph->tot_len = htons(skb->len - nhoff);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
} while ((skb = skb->next));
out:
@@ -1286,10 +1330,10 @@ out:
static struct sk_buff **inet_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
- const struct net_protocol *ops;
+ const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
- struct iphdr *iph;
+ const struct iphdr *iph;
unsigned int hlen;
unsigned int off;
unsigned int id;
@@ -1305,21 +1349,21 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
goto out;
}
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (!ops || !ops->gro_receive)
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
goto out_unlock;
if (*(u8 *)iph != 0x45)
goto out_unlock;
- if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out_unlock;
id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
for (p = *head; p; p = p->next) {
@@ -1328,10 +1372,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (!NAPI_GRO_CB(p)->same_flow)
continue;
- iph2 = ip_hdr(p);
-
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
if ((iph->protocol ^ iph2->protocol) |
- (iph->tos ^ iph2->tos) |
((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
NAPI_GRO_CB(p)->same_flow = 0;
@@ -1341,16 +1388,29 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
(iph->ttl ^ iph2->ttl) |
- ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+ (iph->tos ^ iph2->tos) |
+ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+ /* Save the IP ID check to be included later when we get to
+ * the transport layer so only the inner most IP ID is checked.
+ * This is because some GSO/TSO implementations do not
+ * correctly increment the IP ID for the outer hdrs.
+ */
+ NAPI_GRO_CB(p)->flush_id =
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
}
NAPI_GRO_CB(skb)->flush |= flush;
+ skb_set_network_header(skb, off);
+ /* The above will be needed by the transport layer if there is one
+ * immediately following this IP hdr.
+ */
+
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = ops->gro_receive(head, skb);
+ pp = ops->callbacks.gro_receive(head, skb);
out_unlock:
rcu_read_unlock();
@@ -1361,23 +1421,30 @@ out:
return pp;
}
-static int inet_gro_complete(struct sk_buff *skb)
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
- const struct net_protocol *ops;
- struct iphdr *iph = ip_hdr(skb);
- int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ __be16 newlen = htons(skb->len - nhoff);
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ int proto = iph->protocol;
int err = -ENOSYS;
- __be16 newlen = htons(skb->len - skb_network_offset(skb));
+
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (WARN_ON(!ops || !ops->gro_complete))
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
- err = ops->gro_complete(skb);
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
out_unlock:
rcu_read_unlock();
@@ -1407,82 +1474,44 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
int i;
- for_each_possible_cpu(i) {
- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
- }
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
#if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
u64 res = 0;
int cpu;
for_each_possible_cpu(cpu) {
- void *bhptr, *userptr;
+ void *bhptr;
struct u64_stats_sync *syncp;
- u64 v_bh, v_user;
+ u64 v;
unsigned int start;
- /* first mib used by softirq context, we must use _bh() accessors */
- bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+ bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
- start = u64_stats_fetch_begin_bh(syncp);
- v_bh = *(((u64 *) bhptr) + offt);
- } while (u64_stats_fetch_retry_bh(syncp, start));
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
- /* second mib used in USER context */
- userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
- syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
- do {
- start = u64_stats_fetch_begin(syncp);
- v_user = *(((u64 *) userptr) + offt);
- } while (u64_stats_fetch_retry(syncp, start));
-
- res += v_bh + v_user;
+ res += v;
}
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field64);
#endif
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
-{
- BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize, align);
- if (!ptr[0])
- goto err0;
- ptr[1] = __alloc_percpu(mibsize, align);
- if (!ptr[1])
- goto err1;
- return 0;
-err1:
- free_percpu(ptr[0]);
- ptr[0] = NULL;
-err0:
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
-
-void snmp_mib_free(void __percpu *ptr[2])
-{
- BUG_ON(ptr == NULL);
- free_percpu(ptr[0]);
- free_percpu(ptr[1]);
- ptr[0] = ptr[1] = NULL;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
-
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1491,90 +1520,91 @@ static const struct net_protocol igmp_protocol = {
#endif
static const struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- .no_policy = 1,
- .netns_ok = 1,
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
};
static const struct net_protocol udp_protocol = {
+ .early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
- .gso_send_check = udp4_ufo_send_check,
- .gso_segment = udp4_ufo_fragment,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
+ .err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};
static __net_init int ipv4_mib_init_net(struct net *net)
{
- if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
- sizeof(struct tcp_mib),
- __alignof__(struct tcp_mib)) < 0)
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
goto err_tcp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
- sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
goto err_ip_mib;
- if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
- sizeof(struct linux_mib),
- __alignof__(struct linux_mib)) < 0)
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
+
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
goto err_net_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
goto err_udp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
goto err_udplite_mib;
- if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
- sizeof(struct icmp_mib),
- __alignof__(struct icmp_mib)) < 0)
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
goto err_icmp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
- sizeof(struct icmpmsg_mib),
- __alignof__(struct icmpmsg_mib)) < 0)
+ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpmsg_statistics)
goto err_icmpmsg_mib;
tcp_mib_init(net);
return 0;
err_icmpmsg_mib:
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+ free_percpu(net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+ free_percpu(net->mib.udplite_statistics);
err_udplite_mib:
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+ free_percpu(net->mib.udp_statistics);
err_udp_mib:
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
+ free_percpu(net->mib.net_statistics);
err_net_mib:
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+ free_percpu(net->mib.ip_statistics);
err_ip_mib:
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ free_percpu(net->mib.tcp_statistics);
err_tcp_mib:
return -ENOMEM;
}
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
- snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ kfree(net->mib.icmpmsg_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1587,37 +1617,95 @@ static int __init init_ipv4_mibs(void)
return register_pernet_subsys(&ipv4_mib_ops);
}
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
static int ipv4_proc_init(void);
/*
* IP protocol layer initialiser
*/
+static struct packet_offload ip_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ },
+};
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (udpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+
+ dev_add_offload(&ip_packet_offload);
+ inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
};
static int __init inet_init(void)
{
- struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
-
- sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
- if (!sysctl_local_reserved_ports)
- goto out;
+ BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out_free_reserved_ports;
+ goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
@@ -1627,6 +1715,10 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_udp_proto;
+ rc = proto_register(&ping_prot, 1);
+ if (rc)
+ goto out_unregister_raw_proto;
+
/*
* Tell SOCKET that we are alive...
*/
@@ -1642,14 +1734,14 @@ static int __init inet_init(void)
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
@@ -1682,6 +1774,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ ping_init();
+
/*
* Set the ICMP layer up
*/
@@ -1694,14 +1788,17 @@ static int __init inet_init(void)
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
+ pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
@@ -1712,12 +1809,12 @@ static int __init inet_init(void)
rc = 0;
out:
return rc;
+out_unregister_raw_proto:
+ proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
-out_free_reserved_ports:
- kfree(sysctl_local_reserved_ports);
goto out;
}
@@ -1736,11 +1833,15 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
+ if (ping_proc_init())
+ goto out_ping;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
+ ping_proc_exit();
+out_ping:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();