aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Makefile6
-rw-r--r--net/ipv4/af_inet.c297
-rw-r--r--net/ipv4/ah4.c78
-rw-r--r--net/ipv4/arp.c53
-rw-r--r--net/ipv4/cipso_ipv4.c12
-rw-r--r--net/ipv4/datagram.c24
-rw-r--r--net/ipv4/devinet.c86
-rw-r--r--net/ipv4/esp4.c75
-rw-r--r--net/ipv4/fib_frontend.c8
-rw-r--r--net/ipv4/fib_lookup.h24
-rw-r--r--net/ipv4/fib_rules.c5
-rw-r--r--net/ipv4/fib_semantics.c10
-rw-r--r--net/ipv4/fib_trie.c28
-rw-r--r--net/ipv4/gre_demux.c74
-rw-r--r--net/ipv4/gre_offload.c214
-rw-r--r--net/ipv4/icmp.c56
-rw-r--r--net/ipv4/igmp.c114
-rw-r--r--net/ipv4/inet_connection_sock.c67
-rw-r--r--net/ipv4/inet_diag.c141
-rw-r--r--net/ipv4/inet_fragment.c8
-rw-r--r--net/ipv4/inet_hashtables.c116
-rw-r--r--net/ipv4/inet_lro.c173
-rw-r--r--net/ipv4/inet_timewait_sock.c59
-rw-r--r--net/ipv4/inetpeer.c33
-rw-r--r--net/ipv4/ip_forward.c33
-rw-r--r--net/ipv4/ip_fragment.c8
-rw-r--r--net/ipv4/ip_gre.c16
-rw-r--r--net/ipv4/ip_input.c2
-rw-r--r--net/ipv4/ip_options.c52
-rw-r--r--net/ipv4/ip_output.c151
-rw-r--r--net/ipv4/ip_sockglue.c56
-rw-r--r--net/ipv4/ip_tunnel.c259
-rw-r--r--net/ipv4/ip_tunnel_core.c100
-rw-r--r--net/ipv4/ip_vti.c367
-rw-r--r--net/ipv4/ipcomp.c26
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipip.c16
-rw-r--r--net/ipv4/ipmr.c28
-rw-r--r--net/ipv4/netfilter.c2
-rw-r--r--net/ipv4/netfilter/Kconfig36
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/arp_tables.c11
-rw-r--r--net/ipv4/netfilter/arptable_filter.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c11
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c112
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c140
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c14
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c7
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c5
-rw-r--r--net/ipv4/netfilter/iptable_filter.c7
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c10
-rw-r--r--net/ipv4/netfilter/iptable_nat.c40
-rw-r--r--net/ipv4/netfilter/iptable_raw.c6
-rw-r--r--net/ipv4/netfilter/iptable_security.c7
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c18
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c19
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c104
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c129
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c199
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c90
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c75
-rw-r--r--net/ipv4/ping.c140
-rw-r--r--net/ipv4/proc.c39
-rw-r--r--net/ipv4/protocol.c8
-rw-r--r--net/ipv4/raw.c26
-rw-r--r--net/ipv4/route.c156
-rw-r--r--net/ipv4/syncookies.c85
-rw-r--r--net/ipv4/sysctl_net_ipv4.c218
-rw-r--r--net/ipv4/tcp.c136
-rw-r--r--net/ipv4/tcp_bic.c6
-rw-r--r--net/ipv4/tcp_cong.c80
-rw-r--r--net/ipv4/tcp_cubic.c12
-rw-r--r--net/ipv4/tcp_fastopen.c238
-rw-r--r--net/ipv4/tcp_highspeed.c7
-rw-r--r--net/ipv4/tcp_htcp.c6
-rw-r--r--net/ipv4/tcp_hybla.c19
-rw-r--r--net/ipv4/tcp_illinois.c9
-rw-r--r--net/ipv4/tcp_input.c425
-rw-r--r--net/ipv4/tcp_ipv4.c462
-rw-r--r--net/ipv4/tcp_lp.c6
-rw-r--r--net/ipv4/tcp_memcontrol.c128
-rw-r--r--net/ipv4/tcp_metrics.c354
-rw-r--r--net/ipv4/tcp_minisocks.c47
-rw-r--r--net/ipv4/tcp_offload.c77
-rw-r--r--net/ipv4/tcp_output.c443
-rw-r--r--net/ipv4/tcp_probe.c35
-rw-r--r--net/ipv4/tcp_scalable.c7
-rw-r--r--net/ipv4/tcp_timer.c12
-rw-r--r--net/ipv4/tcp_vegas.c12
-rw-r--r--net/ipv4/tcp_vegas.h10
-rw-r--r--net/ipv4/tcp_veno.c11
-rw-r--r--net/ipv4/tcp_westwood.c1
-rw-r--r--net/ipv4/tcp_yeah.c28
-rw-r--r--net/ipv4/udp.c443
-rw-r--r--net/ipv4/udp_impl.h36
-rw-r--r--net/ipv4/udp_offload.c188
-rw-r--r--net/ipv4/udplite.c1
-rw-r--r--net/ipv4/xfrm4_input.c9
-rw-r--r--net/ipv4/xfrm4_mode_beet.c2
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c70
-rw-r--r--net/ipv4/xfrm4_output.c36
-rw-r--r--net/ipv4/xfrm4_policy.c8
-rw-r--r--net/ipv4/xfrm4_protocol.c301
-rw-r--r--net/ipv4/xfrm4_state.c2
106 files changed, 4988 insertions, 3297 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4b81e91c80f..f032688d20d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
- inet_fragment.o ping.o ip_tunnel_core.o
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
@@ -19,7 +19,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
-gre-y := gre_demux.o gre_offload.o
+gre-y := gre_demux.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
@@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
- xfrm4_output.o
+ xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7a1874b7b8f..d156b3c5f36 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -126,9 +126,6 @@
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);
-struct ipv4_config ipv4_config;
-EXPORT_SYMBOL(ipv4_config);
-
/* New destruction routine */
void inet_sock_destruct(struct sock *sk)
@@ -245,31 +242,6 @@ out:
}
EXPORT_SYMBOL(inet_listen);
-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-u32 ipv6_hash_secret __read_mostly;
-EXPORT_SYMBOL(ipv6_hash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once, and to a non nul value
- * ipv6_hash_secret must be set exactly once.
- */
-void build_ehash_secret(void)
-{
- u32 rnd;
-
- do {
- get_random_bytes(&rnd, sizeof(rnd));
- } while (rnd == 0);
-
- if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) {
- get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
- net_secret_init();
- }
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
/*
* Create an inet socket.
*/
@@ -282,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
- if (unlikely(!inet_ehash_secret))
- if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
- build_ehash_secret();
-
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -344,7 +311,6 @@ lookup_protocol:
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
@@ -356,7 +322,6 @@ lookup_protocol:
goto out;
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
@@ -371,7 +336,7 @@ lookup_protocol:
inet->hdrincl = 1;
}
- if (ipv4_config.no_pmtu_disc)
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
@@ -1034,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
@@ -1044,7 +1008,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
@@ -1053,7 +1016,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
@@ -1062,7 +1024,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
@@ -1162,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
sk->sk_bound_dev_if, sk->sk_protocol,
- inet->inet_sport, inet->inet_dport, sk, false);
+ inet->inet_sport, inet->inet_dport, sk);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -1256,36 +1217,36 @@ static int inet_gso_send_check(struct sk_buff *skb)
if (ihl < sizeof(*iph))
goto out;
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- proto = iph->protocol;
err = -EPROTONOSUPPORT;
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_send_check))
err = ops->callbacks.gso_send_check(skb);
- rcu_read_unlock();
out:
return err;
}
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
+ unsigned int offset = 0;
+ bool udpfrag, encap;
struct iphdr *iph;
int proto;
+ int nhoff;
int ihl;
int id;
- unsigned int offset = 0;
- bool tunnel;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
@@ -1293,12 +1254,18 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
SKB_GSO_TCPV6 |
SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
SKB_GSO_MPLS |
0)))
goto out;
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
@@ -1307,42 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (ihl < sizeof(*iph))
goto out;
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
+ __skb_pull(skb, ihl);
- tunnel = !!skb->encapsulation;
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ SKB_GSO_CB(skb)->encap_level += ihl;
- __skb_pull(skb, ihl);
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- id = ntohs(iph->id);
- proto = iph->protocol;
+
segs = ERR_PTR(-EPROTONOSUPPORT);
- rcu_read_lock();
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
segs = ops->callbacks.gso_segment(skb, features);
- rcu_read_unlock();
if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
- iph = ip_hdr(skb);
- if (!tunnel && proto == IPPROTO_UDP) {
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
- offset += (skb->len - skb->mac_len - iph->ihl * 4);
- } else {
+ offset += skb->len - nhoff - ihl;
+ } else {
iph->id = htons(id++);
}
- iph->tot_len = htons(skb->len - skb->mac_len);
- iph->check = 0;
- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+ iph->tot_len = htons(skb->len - nhoff);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
} while ((skb = skb->next));
out:
@@ -1394,8 +1372,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (!NAPI_GRO_CB(p)->same_flow)
continue;
- iph2 = ip_hdr(p);
-
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
if ((iph->protocol ^ iph2->protocol) |
((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
@@ -1407,13 +1389,24 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
NAPI_GRO_CB(p)->flush |=
(iph->ttl ^ iph2->ttl) |
(iph->tos ^ iph2->tos) |
- (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) |
- ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+ /* Save the IP ID check to be included later when we get to
+ * the transport layer so only the inner most IP ID is checked.
+ * This is because some GSO/TSO implementations do not
+ * correctly increment the IP ID for the outer hdrs.
+ */
+ NAPI_GRO_CB(p)->flush_id =
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
}
NAPI_GRO_CB(skb)->flush |= flush;
+ skb_set_network_header(skb, off);
+ /* The above will be needed by the transport layer if there is one
+ * immediately following this IP hdr.
+ */
+
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
@@ -1428,14 +1421,17 @@ out:
return pp;
}
-static int inet_gro_complete(struct sk_buff *skb)
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
- __be16 newlen = htons(skb->len - skb_network_offset(skb));
- struct iphdr *iph = ip_hdr(skb);
+ __be16 newlen = htons(skb->len - nhoff);
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
const struct net_offload *ops;
int proto = iph->protocol;
int err = -ENOSYS;
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
+
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
@@ -1444,7 +1440,11 @@ static int inet_gro_complete(struct sk_buff *skb)
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
- err = ops->callbacks.gro_complete(skb);
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
out_unlock:
rcu_read_unlock();
@@ -1474,22 +1474,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
- int i, j;
+ int i;
- for_each_possible_cpu(i) {
- for (j = 0; j < SNMP_ARRAY_SZ; j++)
- res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
- }
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
#if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
u64 res = 0;
int cpu;
@@ -1500,12 +1498,12 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
u64 v;
unsigned int start;
- bhptr = per_cpu_ptr(mib[0], cpu);
+ bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
- start = u64_stats_fetch_begin_bh(syncp);
+ start = u64_stats_fetch_begin_irq(syncp);
v = *(((u64 *) bhptr) + offt);
- } while (u64_stats_fetch_retry_bh(syncp, start));
+ } while (u64_stats_fetch_retry_irq(syncp, start));
res += v;
}
@@ -1514,24 +1512,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
EXPORT_SYMBOL_GPL(snmp_fold_field64);
#endif
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
-{
- BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize, align);
- if (!ptr[0])
- return -ENOMEM;
-#if SNMP_ARRAY_SZ == 2
- ptr[1] = __alloc_percpu(mibsize, align);
- if (!ptr[1]) {
- free_percpu(ptr[0]);
- ptr[0] = NULL;
- return -ENOMEM;
- }
-#endif
- return 0;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
-
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1545,9 +1525,11 @@ static const struct net_protocol tcp_protocol = {
.err_handler = tcp_v4_err,
.no_policy = 1,
.netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
};
static const struct net_protocol udp_protocol = {
+ .early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
@@ -1563,29 +1545,32 @@ static const struct net_protocol icmp_protocol = {
static __net_init int ipv4_mib_init_net(struct net *net)
{
- if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
- sizeof(struct tcp_mib),
- __alignof__(struct tcp_mib)) < 0)
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
goto err_tcp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
- sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
goto err_ip_mib;
- if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
- sizeof(struct linux_mib),
- __alignof__(struct linux_mib)) < 0)
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
+
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
goto err_net_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
goto err_udp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
goto err_udplite_mib;
- if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
- sizeof(struct icmp_mib),
- __alignof__(struct icmp_mib)) < 0)
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
goto err_icmp_mib;
net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
GFP_KERNEL);
@@ -1596,17 +1581,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)
return 0;
err_icmpmsg_mib:
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+ free_percpu(net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+ free_percpu(net->mib.udplite_statistics);
err_udplite_mib:
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+ free_percpu(net->mib.udp_statistics);
err_udp_mib:
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
+ free_percpu(net->mib.net_statistics);
err_net_mib:
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+ free_percpu(net->mib.ip_statistics);
err_ip_mib:
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ free_percpu(net->mib.tcp_statistics);
err_tcp_mib:
return -ENOMEM;
}
@@ -1614,12 +1599,12 @@ err_tcp_mib:
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
kfree(net->mib.icmpmsg_statistics);
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1632,6 +1617,39 @@ static int __init init_ipv4_mibs(void)
return register_pernet_subsys(&ipv4_mib_ops);
}
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
static int ipv4_proc_init(void);
/*
@@ -1648,6 +1666,13 @@ static struct packet_offload ip_packet_offload __read_mostly = {
},
};
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ },
+};
+
static int __init ipv4_offload_init(void)
{
/*
@@ -1659,6 +1684,7 @@ static int __init ipv4_offload_init(void)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
dev_add_offload(&ip_packet_offload);
+ inet_add_offload(&ipip_offload, IPPROTO_IPIP);
return 0;
}
@@ -1677,13 +1703,9 @@ static int __init inet_init(void)
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
- sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
- if (!sysctl_local_reserved_ports)
- goto out;
-
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out_free_reserved_ports;
+ goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
@@ -1707,8 +1729,6 @@ static int __init inet_init(void)
ip_static_sysctl_init();
#endif
- tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
-
/*
* Add all the base protocols.
*/
@@ -1770,6 +1790,9 @@ static int __init inet_init(void)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
@@ -1792,8 +1815,6 @@ out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
-out_free_reserved_ports:
- kfree(sysctl_local_reserved_ports);
goto out;
}
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 717902669d2..a2afa89513a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *iph, *top_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
ahp = x->data;
ahash = ahp->ahash;
@@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
ah = ip_auth_hdr(skb);
ihl = ip_hdrlen(skb);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
err = -ENOMEM;
- iph = ah_alloc_tmp(ahash, nfrags, ihl);
+ iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
if (!iph)
goto out;
-
- icv = ah_tmp_icv(ahash, iph, ihl);
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
memset(ah->auth_data, 0, ahp->icv_trunc_len);
@@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
ah->spi = x->id.spi;
ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
- ahash_request_set_crypt(req, sg, icv, skb->len);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
ahash_request_set_callback(req, 0, ah_output_done, skb);
AH_SKB_CB(skb)->tmp = iph;
@@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
struct ip_auth_hdr *ah;
struct ah_data *ahp;
int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
if (!pskb_may_pull(skb, sizeof(*ah)))
goto out;
@@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
iph = ip_hdr(skb);
ihl = ip_hdrlen(skb);
- work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+ ahp->icv_trunc_len + seqhi_len);
if (!work_iph)
goto out;
- auth_data = ah_tmp_auth(work_iph, ihl);
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
memcpy(work_iph, iph, ihl);
memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb_push(skb, ihl);
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
- ahash_request_set_crypt(req, sg, icv, skb->len);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
ahash_request_set_callback(req, 0, ah_input_done, skb);
AH_SKB_CB(skb)->tmp = work_iph;
@@ -397,7 +428,7 @@ out:
return err;
}
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
switch (icmp_hdr(skb)->type) {
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ return 0;
case ICMP_REDIRECT:
break;
default:
- return;
+ return 0;
}
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
ah->spi, IPPROTO_AH, AF_INET);
if (!x)
- return;
+ return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
else
ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
xfrm_state_put(x);
+
+ return 0;
}
static int ah_init_state(struct xfrm_state *x)
@@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)
kfree(ahp);
}
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
static const struct xfrm_type ah_type =
{
@@ -518,11 +555,12 @@ static const struct xfrm_type ah_type =
.output = ah_output
};
-static const struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah4_rcv_cb,
.err_handler = ah4_err,
- .no_policy = 1,
- .netns_ok = 1,
+ .priority = 0,
};
static int __init ah4_init(void)
@@ -531,7 +569,7 @@ static int __init ah4_init(void)
pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+ if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ah_type, AF_INET);
return -EAGAIN;
@@ -541,7 +579,7 @@ static int __init ah4_init(void)
static void __exit ah4_fini(void)
{
- if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+ if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
pr_info("%s: can't remove xfrm type\n", __func__);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7808093cede..1a9b99e0446 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -166,18 +166,20 @@ struct neigh_table arp_tbl = {
.id = "arp_cache",
.parms = {
.tbl = &arp_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
.reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len_bytes = 64*1024,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
@@ -359,14 +361,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
- probes -= neigh->parms->ucast_probes;
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
if (probes < 0) {
if (!(neigh->nud_state & NUD_VALID))
pr_debug("trying to ucast probe in NUD_INVALID\n");
neigh_ha_snapshot(dst_ha, neigh, dev);
dst_hw = dst_ha;
} else {
- probes -= neigh->parms->app_probes;
+ probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
if (probes < 0) {
neigh_app_ns(neigh);
return;
@@ -379,6 +381,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
{
+ struct net *net = dev_net(in_dev->dev);
int scope;
switch (IN_DEV_ARP_IGNORE(in_dev)) {
@@ -397,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
case 3: /* Do not reply for scope host addresses */
sip = 0;
scope = RT_SCOPE_LINK;
+ in_dev = NULL;
break;
case 4: /* Reserved */
case 5:
@@ -408,7 +412,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
default:
return 0;
}
- return !inet_confirm_addr(in_dev, sip, tip, scope);
+ return !inet_confirm_addr(net, in_dev, sip, tip, scope);
}
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
@@ -728,6 +732,7 @@ static int arp_process(struct sk_buff *skb)
int addr_type;
struct neighbour *n;
struct net *net = dev_net(dev);
+ bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
@@ -871,7 +876,7 @@ static int arp_process(struct sk_buff *skb)
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
- in_dev->arp_parms->proxy_delay == 0) {
+ NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
dev, tip, sha, dev->dev_addr,
sha);
@@ -894,10 +899,12 @@ static int arp_process(struct sk_buff *skb)
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
+ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
+ inet_addr_type(net, sip) == RTN_UNICAST;
+
if (n == NULL &&
- (arp->ar_op == htons(ARPOP_REPLY) ||
- (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
- inet_addr_type(net, sip) == RTN_UNICAST)
+ ((arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -910,7 +917,10 @@ static int arp_process(struct sk_buff *skb)
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
- override = time_after(jiffies, n->updated + n->parms->locktime);
+ override = time_after(jiffies,
+ n->updated +
+ NEIGH_VAR(n->parms, LOCKTIME)) ||
+ is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
@@ -1107,7 +1117,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
return err;
}
-int arp_invalidate(struct net_device *dev, __be32 ip)
+static int arp_invalidate(struct net_device *dev, __be32 ip)
{
struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
int err = -ENXIO;
@@ -1122,7 +1132,6 @@ int arp_invalidate(struct net_device *dev, __be32 ip)
return err;
}
-EXPORT_SYMBOL(arp_invalidate);
static int arp_req_delete_public(struct net *net, struct arpreq *r,
struct net_device *dev)
@@ -1284,7 +1293,7 @@ void __init arp_init(void)
dev_add_pack(&arp_packet_type);
arp_proc_init();
#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
#endif
register_netdevice_notifier(&arp_netdev_notifier);
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 667c1d4ca98..69e77c8ff28 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -31,8 +31,7 @@
* the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -1336,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->attr.mls.cat =
- netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
@@ -1432,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->attr.mls.cat =
- netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
@@ -1527,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->attr.mls.cat =
- netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b28e863fe0a..a3095fdefbe 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -53,11 +53,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
RT_CONN_FLAGS(sk), oif,
sk->sk_protocol,
- inet->inet_sport, usin->sin_port, sk, true);
+ inet->inet_sport, usin->sin_port, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -86,18 +86,26 @@ out:
}
EXPORT_SYMBOL(ip4_datagram_connect);
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
void ip4_datagram_release_cb(struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
const struct ip_options_rcu *inet_opt;
__be32 daddr = inet->inet_daddr;
+ struct dst_entry *dst;
struct flowi4 fl4;
struct rtable *rt;
- if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0))
- return;
-
rcu_read_lock();
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ rcu_read_unlock();
+ return;
+ }
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
@@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk)
inet->inet_saddr, inet->inet_dport,
inet->inet_sport, sk->sk_protocol,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
- if (!IS_ERR(rt))
- __sk_dst_set(sk, &rt->dst);
+
+ dst = !IS_ERR(rt) ? &rt->dst : NULL;
+ sk_dst_set(sk, dst);
+
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a1b5bcbd04a..e9449376b58 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,13 +99,13 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_BROADCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .type = NLA_U32 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-static DEFINE_SPINLOCK(inet_addr_hash_lock);
static u32 inet_addr_hash(struct net *net, __be32 addr)
{
@@ -118,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
{
u32 hash = inet_addr_hash(net, ifa->ifa_local);
- spin_lock(&inet_addr_hash_lock);
+ ASSERT_RTNL();
hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
- spin_unlock(&inet_addr_hash_lock);
}
static void inet_hash_remove(struct in_ifaddr *ifa)
{
- spin_lock(&inet_addr_hash_lock);
+ ASSERT_RTNL();
hlist_del_init_rcu(&ifa->hash);
- spin_unlock(&inet_addr_hash_lock);
}
/**
@@ -463,7 +461,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
}
if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
- net_srandom(ifa->ifa_local);
+ prandom_seed((__force u32) ifa->ifa_local);
ifap = last_primary;
}
@@ -473,7 +471,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
inet_hash_insert(dev_net(in_dev->dev), ifa);
cancel_delayed_work(&check_lifetime_work);
- schedule_delayed_work(&check_lifetime_work, 0);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
@@ -500,6 +498,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
return -ENOBUFS;
}
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
if (ifa->ifa_dev != in_dev) {
WARN_ON(ifa->ifa_dev);
in_dev_hold(in_dev);
@@ -682,7 +681,8 @@ static void check_lifetime(struct work_struct *work)
if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
- schedule_delayed_work(&check_lifetime_work, next_sched - now);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+ next_sched - now);
}
static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
@@ -747,6 +747,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
goto errout;
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
in_dev_hold(in_dev);
if (tb[IFA_ADDRESS] == NULL)
@@ -755,7 +756,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
- ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+ ifm->ifa_flags;
ifa->ifa_scope = ifm->ifa_scope;
ifa->ifa_dev = in_dev;
@@ -825,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
ifa_existing = find_matching_ifa(ifa);
if (!ifa_existing) {
/* It would be best to check for !NLM_F_CREATE here but
- * userspace alreay relies on not having to provide this.
+ * userspace already relies on not having to provide this.
*/
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
@@ -838,7 +840,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
ifa = ifa_existing;
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
cancel_delayed_work(&check_lifetime_work);
- schedule_delayed_work(&check_lifetime_work, 0);
+ queue_delayed_work(system_power_efficient_wq,
+ &check_lifetime_work, 0);
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
}
@@ -1236,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
/*
* Confirm that local IP address exists using wildcards:
- * - in_dev: only on this interface, 0=any interface
+ * - net: netns to check, cannot be NULL
+ * - in_dev: only on this interface, NULL=any interface
* - dst: only in the same subnet as dst, 0=any dst
* - local: address, 0=autoselect the local address
* - scope: maximum allowed scope value for the local address
*/
-__be32 inet_confirm_addr(struct in_device *in_dev,
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
__be32 dst, __be32 local, int scope)
{
__be32 addr = 0;
struct net_device *dev;
- struct net *net;
- if (scope != RT_SCOPE_LINK)
+ if (in_dev != NULL)
return confirm_addr_indev(in_dev, dst, local, scope);
- net = dev_net(in_dev->dev);
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
in_dev = __in_dev_get_rcu(dev);
@@ -1382,6 +1384,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
INFINITY_LIFE_TIME);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
inet_insert_ifa(ifa);
}
}
@@ -1435,7 +1439,9 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_ADDRESS */
+ nla_total_size(4) /* IFA_LOCAL */
+ nla_total_size(4) /* IFA_BROADCAST */
- + nla_total_size(IFNAMSIZ); /* IFA_LABEL */
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
static inline u32 cstamp_delta(unsigned long cstamp)
@@ -1503,6 +1509,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
preferred, valid))
goto nla_put_failure;
@@ -1691,6 +1698,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (type == -1 || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
return size;
}
@@ -1727,6 +1736,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+ goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1764,6 +1777,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
[NETCONFA_IFINDEX] = { .len = sizeof(int) },
[NETCONFA_FORWARDING] = { .len = sizeof(int) },
[NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
@@ -1945,6 +1959,19 @@ static void inet_forward_change(struct net *net)
}
}
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+ if (cnf == net->ipv4.devconf_dflt)
+ return NETCONFA_IFINDEX_DEFAULT;
+ else if (cnf == net->ipv4.devconf_all)
+ return NETCONFA_IFINDEX_ALL;
+ else {
+ struct in_device *idev
+ = container_of(cnf, struct in_device, cnf);
+ return idev->dev->ifindex;
+ }
+}
+
static int devinet_conf_proc(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
@@ -1957,6 +1984,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
struct ipv4_devconf *cnf = ctl->extra1;
struct net *net = ctl->extra2;
int i = (int *)ctl->data - cnf->data;
+ int ifindex;
set_bit(i, cnf->state);
@@ -1966,23 +1994,19 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
- int ifindex;
-
- if (cnf == net->ipv4.devconf_dflt)
- ifindex = NETCONFA_IFINDEX_DEFAULT;
- else if (cnf == net->ipv4.devconf_all)
- ifindex = NETCONFA_IFINDEX_ALL;
- else {
- struct in_device *idev =
- container_of(cnf, struct in_device,
- cnf);
- ifindex = idev->dev->ifindex;
- }
+ ifindex = devinet_conf_ifindex(net, cnf);
inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
ifindex, cnf);
}
+ if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ ifindex, cnf);
+ }
}
return ret;
@@ -2160,7 +2184,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
static void devinet_sysctl_register(struct in_device *idev)
{
- neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
+ neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
&idev->cnf);
}
@@ -2298,7 +2322,7 @@ void __init devinet_init(void)
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
- schedule_delayed_work(&check_lifetime_work, 0);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
rtnl_af_register(&inet_af_ops);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 109ee89f123..360b565918c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -121,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct aead_givcrypt_request *req;
struct scatterlist *sg;
struct scatterlist *asg;
- struct esp_data *esp;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -139,8 +138,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
/* skb is pure payload to encrypt */
- esp = x->data;
- aead = esp->aead;
+ aead = x->data;
alen = crypto_aead_authsize(aead);
tfclen = 0;
@@ -154,8 +152,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
- if (esp->padlen)
- clen = ALIGN(clen, esp->padlen);
plen = clen - skb->len - tfclen;
err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
@@ -280,8 +276,7 @@ static int esp_input_done2(struct sk_buff *skb, int err)
{
const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int elen = skb->len - hlen;
@@ -376,8 +371,7 @@ static void esp_input_done(struct crypto_async_request *base, int err)
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
@@ -459,9 +453,8 @@ out:
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
- u32 align = max_t(u32, blksize, esp->padlen);
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
unsigned int net_adj;
switch (x->props.mode) {
@@ -476,11 +469,11 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
BUG();
}
- return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
- net_adj) & ~(align - 1)) + net_adj - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
}
-static void esp4_err(struct sk_buff *skb, u32 info)
+static int esp4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -490,39 +483,39 @@ static void esp4_err(struct sk_buff *skb, u32 info)
switch (icmp_hdr(skb)->type) {
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ return 0;
case ICMP_REDIRECT:
break;
default:
- return;
+ return 0;
}
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
- return;
+ return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
else
ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
xfrm_state_put(x);
+
+ return 0;
}
static void esp_destroy(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
+ struct crypto_aead *aead = x->data;
- if (!esp)
+ if (!aead)
return;
- crypto_free_aead(esp->aead);
- kfree(esp);
+ crypto_free_aead(aead);
}
static int esp_init_aead(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
int err;
@@ -531,7 +524,7 @@ static int esp_init_aead(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
err = crypto_aead_setkey(aead, x->aead->alg_key,
(x->aead->alg_key_len + 7) / 8);
@@ -548,7 +541,6 @@ error:
static int esp_init_authenc(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
struct rtattr *rta;
@@ -583,7 +575,7 @@ static int esp_init_authenc(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
(x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -638,16 +630,11 @@ error:
static int esp_init_state(struct xfrm_state *x)
{
- struct esp_data *esp;
struct crypto_aead *aead;
u32 align;
int err;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
-
- x->data = esp;
+ x->data = NULL;
if (x->aead)
err = esp_init_aead(x);
@@ -657,9 +644,7 @@ static int esp_init_state(struct xfrm_state *x)
if (err)
goto error;
- aead = esp->aead;
-
- esp->padlen = 0;
+ aead = x->data;
x->props.header_len = sizeof(struct ip_esp_hdr) +
crypto_aead_ivsize(aead);
@@ -683,14 +668,17 @@ static int esp_init_state(struct xfrm_state *x)
}
align = ALIGN(crypto_aead_blocksize(aead), 4);
- if (esp->padlen)
- align = max_t(u32, align, esp->padlen);
- x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
return err;
}
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
static const struct xfrm_type esp_type =
{
.description = "ESP4",
@@ -704,11 +692,12 @@ static const struct xfrm_type esp_type =
.output = esp_output
};
-static const struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp4_rcv_cb,
.err_handler = esp4_err,
- .no_policy = 1,
- .netns_ok = 1,
+ .priority = 0,
};
static int __init esp4_init(void)
@@ -717,7 +706,7 @@ static int __init esp4_init(void)
pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+ if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&esp_type, AF_INET);
return -EAGAIN;
@@ -727,7 +716,7 @@ static int __init esp4_init(void)
static void __exit esp4_fini(void)
{
- if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+ if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
pr_info("%s: can't remove xfrm type\n", __func__);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b3f627ac4ed..255aa9946fe 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -250,7 +250,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = oif;
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
fl4.flowi4_tos = tos;
@@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
- return ip_rt_dump(skb, cb);
+ return skb->len;
s_h = cb->args[0];
s_e = cb->args[1];
@@ -933,7 +933,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
local_bh_disable();
frn->tb_id = tb->tb_id;
- rcu_read_lock();
frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
@@ -942,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
frn->type = res.type;
frn->scope = res.scope;
}
- rcu_read_unlock();
local_bh_enable();
}
}
@@ -1049,6 +1047,8 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
}
in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return NOTIFY_DONE;
switch (event) {
case NETDEV_UP:
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index af0f14aba16..1e4f6600b31 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -24,21 +24,15 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
}
/* Exported by fib_semantics.c */
-extern void fib_release_info(struct fib_info *);
-extern struct fib_info *fib_create_info(struct fib_config *cfg);
-extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
-extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, __be32 dst,
- int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
-extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info,
- unsigned int nlm_flags);
-extern struct fib_alias *fib_find_alias(struct list_head *fah,
- u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort,
- int *last_idx, int dflt);
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
+ u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
static inline void fib_result_assign(struct fib_result *res,
struct fib_info *fi)
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 523be38e37d..f2e15738534 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -104,7 +104,10 @@ errout:
static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
struct fib_result *result = (struct fib_result *) arg->result;
- struct net_device *dev = result->fi->fib_dev;
+ struct net_device *dev = NULL;
+
+ if (result->fi)
+ dev = result->fi->fib_dev;
/* do not accept result if the route does
* not meet the required prefix length
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d5dbca5ecf6..b10cd43a472 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -380,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
}
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info,
+ int dst_len, u32 tb_id, const struct nl_info *info,
unsigned int nlm_flags)
{
struct sk_buff *skb;
@@ -426,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
return NULL;
}
-int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int dflt)
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx,
+ int dflt)
{
struct neighbour *n;
int state = NUD_NONE;
@@ -630,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
.daddr = nh->nh_gw,
.flowi4_scope = cfg->fc_scope + 1,
.flowi4_oif = nh->nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
};
/* It is not necessary, but requires a bit of thinking */
@@ -819,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (fi == NULL)
goto failure;
+ fib_info_cnt++;
if (cfg->fc_mx) {
fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
if (!fi->fib_metrics)
goto failure;
} else
fi->fib_metrics = (u32 *) dst_default_metrics;
- fib_info_cnt++;
fi->fib_net = hold_net(net);
fi->fib_protocol = cfg->fc_protocol;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3df6d3edb2a..5afeb5aa4c7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -762,12 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- if (tkey_extract_bits(node->key,
- oldtnode->pos + oldtnode->bits,
- 1) == 0)
- put_child(tn, 2*i, node);
- else
- put_child(tn, 2*i+1, node);
+ put_child(tn,
+ tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
+ node);
continue;
}
@@ -1120,12 +1117,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* first tnode need some special handling
*/
- if (tp)
- pos = tp->pos+tp->bits;
- else
- pos = 0;
-
if (n) {
+ pos = tp ? tp->pos+tp->bits : 0;
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
} else {
@@ -2530,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
- int len;
if (fa->fa_type == RTN_BROADCAST
|| fa->fa_type == RTN_MULTICAST)
continue;
+ seq_setwidth(seq, 127);
+
if (fi)
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u%n",
+ "%d\t%08X\t%d\t%u\t%u",
fi->fib_dev ? fi->fib_dev->name : "*",
prefix,
fi->fib_nh->nh_gw, flags, 0, 0,
@@ -2548,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
- fi->fib_rtt >> 3, &len);
+ fi->fib_rtt >> 3);
else
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u%n",
+ "%d\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0, &len);
+ mask, 0, 0, 0);
- seq_printf(seq, "%*s\n", 127 - len, "");
+ seq_pad(seq, '\n');
}
}
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 736c9fc3ef9..0485bf7f8f0 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
skb_push(skb, hdr_len);
+ skb_reset_transport_header(skb);
greh = (struct gre_base_hdr *)skb->data;
greh->flags = tnl_flags_to_gre_flags(tpi->flags);
greh->protocol = tpi->proto;
@@ -84,7 +85,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
ptr--;
}
if (tpi->flags&TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
*ptr = 0;
*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
skb->len, 0));
@@ -93,57 +95,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
}
EXPORT_SYMBOL_GPL(gre_build_header);
-struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
-{
- int err;
-
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
-
- if (skb_is_gso(skb)) {
- err = skb_unclone(skb, GFP_ATOMIC);
- if (unlikely(err))
- goto error;
- skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
- return skb;
- } else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) {
- err = skb_checksum_help(skb);
- if (unlikely(err))
- goto error;
- } else if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->ip_summed = CHECKSUM_NONE;
-
- return skb;
-error:
- kfree_skb(skb);
- return ERR_PTR(err);
-}
-EXPORT_SYMBOL_GPL(gre_handle_offloads);
-
-static __sum16 check_checksum(struct sk_buff *skb)
-{
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
-
- if (!csum)
- break;
- /* Fall through. */
-
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- break;
- }
-
- return csum;
-}
-
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
@@ -170,7 +121,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
- if (check_checksum(skb)) {
+ if (skb_checksum_simple_validate(skb)) {
*csum_err = true;
return -EINVAL;
}
@@ -211,6 +162,14 @@ static int gre_cisco_rcv(struct sk_buff *skb)
int i;
bool csum_err = false;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
if (parse_gre_header(skb, &tpi, &csum_err) < 0)
goto drop;
@@ -384,14 +343,7 @@ static int __init gre_init(void)
goto err_gre;
}
- if (gre_offload_init()) {
- pr_err("can't add protocol offload\n");
- goto err_gso;
- }
-
return 0;
-err_gso:
- gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
err_gre:
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
err:
@@ -400,8 +352,6 @@ err:
static void __exit gre_exit(void)
{
- gre_offload_exit();
-
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 55e6bfb3a28..f0bdd47bbbc 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -11,6 +11,7 @@
*/
#include <linux/skbuff.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/gre.h>
@@ -26,8 +27,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
netdev_features_t enc_features;
- int ghl = GRE_HEADER_SECTION;
+ int ghl;
struct gre_base_hdr *greh;
+ u16 mac_offset = skb->mac_header;
int mac_len = skb->mac_len;
__be16 protocol = skb->protocol;
int tnl_hlen;
@@ -39,7 +41,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
- SKB_GSO_GRE)))
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP)))
goto out;
if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
@@ -47,23 +51,21 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
- if (greh->flags & GRE_KEY)
- ghl += GRE_HEADER_SECTION;
- if (greh->flags & GRE_SEQ)
- ghl += GRE_HEADER_SECTION;
- if (greh->flags & GRE_CSUM) {
- ghl += GRE_HEADER_SECTION;
- csum = true;
- } else
- csum = false;
+ ghl = skb_inner_network_header(skb) - skb_transport_header(skb);
+ if (unlikely(ghl < sizeof(*greh)))
+ goto out;
- /* setup inner skb. */
- skb->protocol = greh->protocol;
- skb->encapsulation = 0;
+ csum = !!(greh->flags & GRE_CSUM);
+ if (csum)
+ skb->encap_hdr_csum = 1;
if (unlikely(!pskb_may_pull(skb, ghl)))
goto out;
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
__skb_pull(skb, ghl);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
@@ -72,8 +74,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
/* segment inner packet. */
enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs))
+ if (!segs || IS_ERR(segs)) {
+ skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
goto out;
+ }
skb = segs;
tnl_hlen = skb_tnl_header_len(skb);
@@ -93,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
}
- greh = (struct gre_base_hdr *)(skb->data);
+ skb_reset_transport_header(skb);
+
+ greh = (struct gre_base_hdr *)
+ skb_transport_header(skb);
pcsum = (__be32 *)(greh + 1);
*pcsum = 0;
- *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0));
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
}
__skb_push(skb, tnl_hlen - ghl);
@@ -112,19 +119,180 @@ out:
return segs;
}
+/* Compute the whole skb csum in s/w and store it, then verify GRO csum
+ * starting from gro_offset.
+ */
+static __sum16 gro_skb_checksum(struct sk_buff *skb)
+{
+ __sum16 sum;
+
+ skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
+ csum_partial(skb->data, skb_gro_offset(skb), 0));
+ sum = csum_fold(NAPI_GRO_CB(skb)->csum);
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
+ if (unlikely(!sum) && !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ } else {
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ }
+
+ return sum;
+}
+
+static struct sk_buff **gre_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct gre_base_hdr *greh;
+ unsigned int hlen, grehlen;
+ unsigned int off;
+ int flush = 1;
+ struct packet_offload *ptype;
+ __be16 type;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*greh);
+ greh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+ }
+
+ /* Only support version 0 and K (key), C (csum) flags. Note that
+ * although the support for the S (seq#) flag can be added easily
+ * for GRO, this is problematic for GSO hence can not be enabled
+ * here because a GRO pkt may end up in the forwarding path, thus
+ * requiring GSO support to break it up correctly.
+ */
+ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+ goto out;
+
+ type = greh->protocol;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (ptype == NULL)
+ goto out_unlock;
+
+ grehlen = GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ hlen = off + grehlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out_unlock;
+ }
+ if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
+ __sum16 csum = 0;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ csum = csum_fold(NAPI_GRO_CB(skb)->csum);
+ /* Don't trust csum error calculated/reported by h/w */
+ if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
+ csum = gro_skb_checksum(skb);
+
+ /* GRE CSUM is the 1's complement of the 1's complement sum
+ * of the GRE hdr plus payload so it should add up to 0xffff
+ * (and 0 after csum_fold()) just like the IPv4 hdr csum.
+ */
+ if (csum)
+ goto out_unlock;
+ }
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct gre_base_hdr *greh2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ /* The following checks are needed to ensure only pkts
+ * from the same tunnel are considered for aggregation.
+ * The criteria for "the same tunnel" includes:
+ * 1) same version (we only support version 0 here)
+ * 2) same protocol (we only support ETH_P_IP for now)
+ * 3) same set of flags
+ * 4) same key if the key field is present.
+ */
+ greh2 = (struct gre_base_hdr *)(p->data + off);
+
+ if (greh2->flags != greh->flags ||
+ greh2->protocol != greh->protocol) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (greh->flags & GRE_KEY) {
+ /* compare keys */
+ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+ }
+
+ skb_gro_pull(skb, grehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+ struct packet_offload *ptype;
+ unsigned int grehlen = sizeof(*greh);
+ int err = -ENOENT;
+ __be16 type;
+
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+ type = greh->protocol;
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype != NULL)
+ err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+ rcu_read_unlock();
+ return err;
+}
+
static const struct net_offload gre_offload = {
.callbacks = {
.gso_send_check = gre_gso_send_check,
.gso_segment = gre_gso_segment,
+ .gro_receive = gre_gro_receive,
+ .gro_complete = gre_gro_complete,
},
};
-int __init gre_offload_init(void)
+static int __init gre_offload_init(void)
{
return inet_add_offload(&gre_offload, IPPROTO_GRE);
}
-
-void __exit gre_offload_exit(void)
-{
- inet_del_offload(&gre_offload, IPPROTO_GRE);
-}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5f7d11a4587..42b7bcf8045 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct sock *sk;
struct inet_sock *inet;
__be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
@@ -349,10 +350,14 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
icmp_param->data.icmph.checksum = 0;
inet->tos = ip_hdr(skb)->tos;
+ sk->sk_mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
if (icmp_param->replyopts.opt.opt.optlen) {
ipc.opt = &icmp_param->replyopts.opt;
if (ipc.opt->opt.srr)
@@ -361,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -379,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
const struct iphdr *iph,
- __be32 saddr, u8 tos,
+ __be32 saddr, u8 tos, u32 mark,
int type, int code,
struct icmp_bxm *param)
{
@@ -391,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->daddr = (param->replyopts.opt.opt.srr ?
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
@@ -488,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct flowi4 fl4;
__be32 saddr;
u8 tos;
+ u32 mark;
struct net *net;
struct sock *sk;
@@ -589,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -605,11 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
icmp_param->skb = skb_in;
icmp_param->offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param->replyopts.opt;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
- rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
type, code, icmp_param);
if (IS_ERR(rt))
goto out_unlock;
@@ -663,6 +675,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
rcu_read_unlock();
}
+static bool icmp_tag_validation(int proto)
+{
+ bool ok;
+
+ rcu_read_lock();
+ ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
+ rcu_read_unlock();
+ return ok;
+}
+
/*
* Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
* ICMP_PARAMETERPROB.
@@ -700,13 +722,23 @@ static void icmp_unreach(struct sk_buff *skb)
case ICMP_PORT_UNREACH:
break;
case ICMP_FRAG_NEEDED:
- if (ipv4_config.no_pmtu_disc) {
+ /* for documentation of the ip_no_pmtu_disc
+ * values please see
+ * Documentation/networking/ip-sysctl.txt
+ */
+ switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+ default:
LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
&iph->daddr);
- } else {
- info = ntohs(icmph->un.frag.mtu);
- if (!info)
+ break;
+ case 2:
+ goto out;
+ case 3:
+ if (!icmp_tag_validation(iph->protocol))
goto out;
+ /* fall through */
+ case 0:
+ info = ntohs(icmph->un.frag.mtu);
}
break;
case ICMP_SR_FAILED:
@@ -881,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb)
ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto csum_error;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
if (!pskb_pull(skb, sizeof(*icmph)))
goto error;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d6c0e64ec97..db710b059ba 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -211,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im)
/* It must be called with locked im->lock */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv = net_random() % max_delay;
+ int tv = prandom_u32() % max_delay;
im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
@@ -220,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = net_random() % in_dev->mr_maxdelay;
+ int tv = prandom_u32() % in_dev->mr_maxdelay;
in_dev->mr_gq_running = 1;
if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
@@ -229,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
- int tv = net_random() % delay;
+ int tv = prandom_u32() % delay;
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
@@ -310,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
struct ip_sf_list *psf;
int scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->saddr = fl4.saddr;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(pip, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
((u8 *)&pip[1])[0] = IPOPT_RA;
((u8 *)&pip[1])[1] = 4;
((u8 *)&pip[1])[2] = 0;
@@ -463,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
}
first = 1;
psf_prev = NULL;
- for (psf=*psf_list; psf; psf=psf_next) {
+ for (psf = *psf_list; psf; psf = psf_next) {
__be32 *psrc;
psf_next = psf->sf_next;
@@ -520,7 +520,7 @@ empty_source:
return skb;
if (pmc->crcount || isquery) {
/* make sure we have room for group header */
- if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) {
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
@@ -576,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
struct ip_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf=*ppsf; psf; psf = psf_next) {
+ for (psf = *ppsf; psf; psf = psf_next) {
psf_next = psf->sf_next;
if (psf->sf_crcount == 0) {
if (psf_prev)
@@ -600,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
pmc_next = pmc->next;
if (pmc->sfmode == MCAST_INCLUDE) {
type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
- ip_select_ident(iph, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
@@ -736,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data)
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_timer_expire(unsigned long data)
@@ -749,7 +749,7 @@ static void igmp_ifc_timer_expire(unsigned long data)
igmp_ifc_start_timer(in_dev,
unsolicited_report_interval(in_dev));
}
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_event(struct in_device *in_dev)
@@ -764,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev)
static void igmp_timer_expire(unsigned long data)
{
- struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct ip_mc_list *im = (struct ip_mc_list *)data;
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
@@ -794,10 +794,10 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
int i, scount;
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++) {
+ for (i = 0; i < nsrcs; i++) {
/* skip inactive filters */
if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
@@ -825,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++)
+ for (i = 0; i < nsrcs; i++)
if (srcs[i] == psf->sf_inaddr) {
psf->sf_gsresp = 1;
scount++;
@@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
goto drop;
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto drop;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
ih = igmp_hdr(skb);
switch (ih->type) {
@@ -1103,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->tomb = im->tomb;
pmc->sources = im->sources;
im->tomb = im->sources = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = pmc->crcount;
}
spin_unlock_bh(&im->lock);
@@ -1121,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
if (pmc->multiaddr == multiaddr)
break;
pmc_prev = pmc;
@@ -1134,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
if (pmc) {
- for (psf=pmc->tomb; psf; psf=psf_next) {
+ for (psf = pmc->tomb; psf; psf = psf_next) {
psf_next = psf->sf_next;
kfree(psf);
}
@@ -1167,7 +1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
psf = pmc->tomb;
pmc->tomb = NULL;
spin_unlock_bh(&pmc->lock);
- for (; psf; psf=psf_next) {
+ for (; psf; psf = psf_next) {
psf_next = psf->sf_next;
kfree(psf);
}
@@ -1557,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
int rv = 0;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1630,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[sfmode]--;
}
err = 0;
- for (i=0; i<sfcount; i++) {
+ for (i = 0; i < sfcount; i++) {
int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
changerec |= rv > 0;
@@ -1650,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
IGMP_Unsolicited_Report_Count;
in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
} else if (sf_setstate(pmc) || changerec) {
@@ -1671,7 +1663,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
struct ip_sf_list *psf, *psf_prev;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1699,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc)
struct ip_sf_list *psf;
int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
if (pmc->sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
@@ -1716,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
int new_in, rv;
rv = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (pmc->sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -1726,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
if (!psf->sf_oldin) {
struct ip_sf_list *prev = NULL;
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) {
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
prev = dpsf;
@@ -1748,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next)
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
@@ -1800,7 +1792,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!delta)
pmc->sfcount[sfmode]++;
err = 0;
- for (i=0; i<sfcount; i++) {
+ for (i = 0; i < sfcount; i++) {
err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
@@ -1810,7 +1802,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!delta)
pmc->sfcount[sfmode]--;
- for (j=0; j<i; j++)
+ for (j = 0; j < i; j++)
(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
@@ -1829,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
IGMP_Unsolicited_Report_Count;
in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
} else if (sf_setstate(pmc)) {
@@ -1844,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
struct ip_sf_list *psf, *nextpsf;
- for (psf=pmc->tomb; psf; psf=nextpsf) {
+ for (psf = pmc->tomb; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
pmc->tomb = NULL;
- for (psf=pmc->sources; psf; psf=nextpsf) {
+ for (psf = pmc->sources; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
@@ -1952,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
rtnl_lock();
in_dev = ip_mc_find_dev(net, imr);
+ if (!in_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list;
(iml = rtnl_dereference(*imlp)) != NULL;
@@ -1969,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
- if (in_dev)
- ip_mc_dec_group(in_dev, group);
+ ip_mc_dec_group(in_dev, group);
rtnl_unlock();
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
return 0;
}
- if (!in_dev)
- ret = -ENODEV;
+out:
rtnl_unlock();
return ret;
}
@@ -2043,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -2062,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
- for (j=i+1; j<psl->sl_count; j++)
+ for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
@@ -2088,7 +2082,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
newpsl->sl_max = count;
newpsl->sl_count = count - IP_SFBLOCK;
if (psl) {
- for (i=0; i<psl->sl_count; i++)
+ for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
@@ -2098,7 +2092,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -2106,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
if (rv == 0) /* address already there is an error */
goto done;
- for (j=psl->sl_count-1; j>=i; j--)
+ for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = mreqs->imr_sourceaddr;
psl->sl_count++;
@@ -2305,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
return -EFAULT;
}
- for (i=0; i<copycount; i++) {
+ for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
@@ -2350,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
if (!psl)
goto unlock;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
if (psl->sl_addr[i] == rmt_addr)
break;
}
@@ -2423,7 +2417,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
rv = 1;
} else if (im) {
if (src_addr) {
- for (psf=im->sources; psf; psf=psf->sf_next) {
+ for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
}
@@ -2762,6 +2756,7 @@ static struct pernet_operations igmp_net_ops = {
.init = igmp_net_init,
.exit = igmp_net_exit,
};
+#endif
static int igmp_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -2785,8 +2780,9 @@ static struct notifier_block igmp_notifier = {
.notifier_call = igmp_netdev_event,
};
-int __init igmp_mc_proc_init(void)
+int __init igmp_mc_init(void)
{
+#if defined(CONFIG_PROC_FS)
int err;
err = register_pernet_subsys(&igmp_net_ops);
@@ -2800,5 +2796,7 @@ int __init igmp_mc_proc_init(void)
reg_notif_fail:
unregister_pernet_subsys(&igmp_net_ops);
return err;
-}
+#else
+ return register_netdevice_notifier(&igmp_notifier);
#endif
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6acb541c909..14d02ea905b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,27 +29,16 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
-/*
- * This struct holds the first and last local port number.
- */
-struct local_ports sysctl_local_ports __read_mostly = {
- .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
- .range = { 32768, 61000 },
-};
-
-unsigned long *sysctl_local_reserved_ports;
-EXPORT_SYMBOL(sysctl_local_reserved_ports);
-
-void inet_get_local_port_range(int *low, int *high)
+void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
do {
- seq = read_seqbegin(&sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
- *low = sysctl_local_ports.range[0];
- *high = sysctl_local_ports.range[1];
- } while (read_seqretry(&sysctl_local_ports.lock, seq));
+ *low = net->ipv4.ip_local_ports.range[0];
+ *high = net->ipv4.ip_local_ports.range[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
@@ -79,17 +68,16 @@ int inet_csk_bind_conflict(const struct sock *sk,
(!reuseport || !sk2->sk_reuseport ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
- const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
- sk2_rcv_saddr == sk_rcv_saddr(sk))
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
- const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
- sk2_rcv_saddr == sk_rcv_saddr(sk))
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
}
@@ -116,13 +104,13 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
int remaining, rover, low, high;
again:
- inet_get_local_port_range(&low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- smallest_rover = rover = net_random() % remaining + low;
+ smallest_rover = rover = prandom_u32() % remaining + low;
smallest_size = -1;
do {
- if (inet_is_reserved_local_port(rover))
+ if (inet_is_local_reserved_port(net, rover))
goto next_nolock;
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
@@ -417,12 +405,12 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
struct net *net = sock_net(sk);
int flags = inet_sk_flowi_flags(sk);
- flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol,
flags,
- (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
- ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -454,11 +442,11 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
rcu_read_lock();
opt = rcu_dereference(newinet->inet_opt);
- flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
- (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
- ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -504,9 +492,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
prev = &req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
- if (ireq->rmt_port == rport &&
- ireq->rmt_addr == raddr &&
- ireq->loc_addr == laddr &&
+ if (ireq->ir_rmt_port == rport &&
+ ireq->ir_rmt_addr == raddr &&
+ ireq->ir_loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
WARN_ON(req->sk);
*prevp = prev;
@@ -523,7 +511,8 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+ const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
+ inet_rsk(req)->ir_rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
@@ -683,11 +672,13 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;
- inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
- inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
- inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
+ inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+ inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 5f648751fce..e34dccbc4d7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -106,6 +106,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->id.idiag_sport = inet->inet_sport;
r->id.idiag_dport = inet->inet_dport;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
r->id.idiag_src[0] = inet->inet_rcv_saddr;
r->id.idiag_dst[0] = inet->inet_daddr;
@@ -121,13 +125,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- const struct ipv6_pinfo *np = inet6_sk(sk);
- *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = np->daddr;
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
if (ext & (1 << (INET_DIAG_TCLASS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
goto errout;
}
#endif
@@ -222,7 +226,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- long tmo;
+ s32 tmo;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
@@ -234,32 +238,36 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_ttd - jiffies;
+ tmo = tw->tw_ttd - inet_tw_time_stamp();
if (tmo < 0)
tmo = 0;
r->idiag_family = tw->tw_family;
r->idiag_retrans = 0;
+
r->id.idiag_if = tw->tw_bound_dev_if;
sock_diag_save_cookie(tw, r->id.idiag_cookie);
+
r->id.idiag_sport = tw->tw_sport;
r->id.idiag_dport = tw->tw_dport;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
r->id.idiag_src[0] = tw->tw_rcv_saddr;
r->id.idiag_dst[0] = tw->tw_daddr;
+
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ);
+ r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
r->idiag_inode = 0;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- const struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
-
- *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
+ *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
}
#endif
@@ -273,10 +281,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
const struct nlmsghdr *unlh)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
- skb, r, portid, seq, nlmsg_flags,
- unlh);
- return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+ nlmsg_flags, unlh);
+
+ return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
+ nlmsg_flags, unlh);
}
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -338,12 +347,9 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
err = 0;
out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put((struct inet_timewait_sock *)sk);
- else
- sock_put(sk);
- }
+ if (sk)
+ sock_gen_put(sk);
+
out_nosk:
return err;
}
@@ -489,10 +495,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry.family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
+ entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry.daddr = sk->sk_v6_daddr.s6_addr32;
} else
#endif
{
@@ -635,22 +640,22 @@ static int inet_csk_diag_dump(struct sock *sk,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
-static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+
if (bc != NULL) {
struct inet_diag_entry entry;
entry.family = tw->tw_family;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
- entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw6->tw_v6_daddr.s6_addr32;
+ entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
+ entry.daddr = tw->tw_v6_daddr.s6_addr32;
} else
#endif
{
@@ -682,12 +687,12 @@ static inline void inet_diag_req_addrs(const struct sock *sk,
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6) {
if (req->rsk_ops->family == AF_INET6) {
- entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32;
- entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32;
+ entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
+ entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
} else if (req->rsk_ops->family == AF_INET) {
- ipv6_addr_set_v4mapped(ireq->loc_addr,
+ ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
&entry->saddr_storage);
- ipv6_addr_set_v4mapped(ireq->rmt_addr,
+ ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
&entry->daddr_storage);
entry->saddr = entry->saddr_storage.s6_addr32;
entry->daddr = entry->daddr_storage.s6_addr32;
@@ -695,8 +700,8 @@ static inline void inet_diag_req_addrs(const struct sock *sk,
} else
#endif
{
- entry->saddr = &ireq->loc_addr;
- entry->daddr = &ireq->rmt_addr;
+ entry->saddr = &ireq->ir_loc_addr;
+ entry->daddr = &ireq->ir_rmt_addr;
}
}
@@ -731,9 +736,14 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
tmo = 0;
r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = ireq->rmt_port;
- r->id.idiag_src[0] = ireq->loc_addr;
- r->id.idiag_dst[0] = ireq->rmt_addr;
+ r->id.idiag_dport = ireq->ir_rmt_port;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = ireq->ir_loc_addr;
+ r->id.idiag_dst[0] = ireq->ir_rmt_addr;
+
r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
@@ -792,13 +802,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (reqnum < s_reqnum)
continue;
- if (r->id.idiag_dport != ireq->rmt_port &&
+ if (r->id.idiag_dport != ireq->ir_rmt_port &&
r->id.idiag_dport)
continue;
if (bc) {
inet_diag_req_addrs(sk, req, &entry);
- entry.dport = ntohs(ireq->rmt_port);
+ entry.dport = ntohs(ireq->ir_rmt_port);
if (!inet_diag_bc_run(bc, &entry))
continue;
@@ -911,8 +921,7 @@ skip_listen_ht:
num = 0;
- if (hlist_nulls_empty(&head->chain) &&
- hlist_nulls_empty(&head->twchain))
+ if (hlist_nulls_empty(&head->chain))
continue;
if (i > s_i)
@@ -920,24 +929,31 @@ skip_listen_ht:
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
+ int res;
+ int state;
if (!net_eq(sock_net(sk), net))
continue;
if (num < s_num)
goto next_normal;
- if (!(r->idiag_states & (1 << sk->sk_state)))
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_substate : sk->sk_state;
+ if (!(r->idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
+ sk->sk_family != r->sdiag_family)
goto next_normal;
- if (r->id.idiag_sport != inet->inet_sport &&
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
r->id.idiag_sport)
goto next_normal;
- if (r->id.idiag_dport != inet->inet_dport &&
+ if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
+ else
+ res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+ if (res < 0) {
spin_unlock_bh(lock);
goto done;
}
@@ -945,33 +961,6 @@ next_normal:
++num;
}
- if (r->idiag_states & TCPF_TIME_WAIT) {
- struct inet_timewait_sock *tw;
-
- inet_twsk_for_each(tw, node,
- &head->twchain) {
- if (!net_eq(twsk_net(tw), net))
- continue;
-
- if (num < s_num)
- goto next_dying;
- if (r->sdiag_family != AF_UNSPEC &&
- tw->tw_family != r->sdiag_family)
- goto next_dying;
- if (r->id.idiag_sport != tw->tw_sport &&
- r->id.idiag_sport)
- goto next_dying;
- if (r->id.idiag_dport != tw->tw_dport &&
- r->id.idiag_dport)
- goto next_dying;
- if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
- spin_unlock_bh(lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
spin_unlock_bh(lock);
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c5313a9c019..3b01959bf4b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -93,9 +93,6 @@ void inet_frags_init(struct inet_frags *f)
}
rwlock_init(&f->lock);
- f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^
- (jiffies ^ (jiffies >> 6)));
-
setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
(unsigned long)f);
f->secret_timer.expires = jiffies + f->secret_interval;
@@ -211,7 +208,7 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
}
work = frag_mem_limit(nf) - nf->low_thresh;
- while (work > 0) {
+ while (work > 0 || force) {
spin_lock(&nf->lru_lock);
if (list_empty(&nf->lru_list)) {
@@ -281,9 +278,10 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
+ inet_frag_lru_add(nf, qp);
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
- inet_frag_lru_add(nf, qp);
+
return qp;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7bd8983dbfc..43116e8c8e1 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -24,6 +24,31 @@
#include <net/secure_seq.h>
#include <net/ip.h>
+static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 inet_ehash_secret __read_mostly;
+
+ net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
+}
+
+
+static unsigned int inet_sk_ehashfn(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const __be32 laddr = inet->inet_rcv_saddr;
+ const __u16 lport = inet->inet_num;
+ const __be32 faddr = inet->inet_daddr;
+ const __be16 fport = inet->inet_dport;
+ struct net *net = sock_net(sk);
+
+ return inet_ehashfn(net, laddr, lport, faddr, fport);
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -230,13 +255,26 @@ begin:
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_free(inet_twsk(sk));
+ else
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
const int dif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr)
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
struct sock *sk;
const struct hlist_nulls_node *node;
@@ -255,13 +293,13 @@ begin:
if (likely(INET_MATCH(sk, net, acookie,
saddr, daddr, ports, dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
- goto begintw;
+ goto out;
if (unlikely(!INET_MATCH(sk, net, acookie,
saddr, daddr, ports, dif))) {
- sock_put(sk);
+ sock_gen_put(sk);
goto begin;
}
- goto out;
+ goto found;
}
}
/*
@@ -271,37 +309,9 @@ begin:
*/
if (get_nulls_value(node) != slot)
goto begin;
-
-begintw:
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_nulls_for_each_rcu(sk, node, &head->twchain) {
- if (sk->sk_hash != hash)
- continue;
- if (likely(INET_TW_MATCH(sk, net, acookie,
- saddr, daddr, ports,
- dif))) {
- if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
- sk = NULL;
- goto out;
- }
- if (unlikely(!INET_TW_MATCH(sk, net, acookie,
- saddr, daddr, ports,
- dif))) {
- sock_put(sk);
- goto begintw;
- }
- goto out;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begintw;
- sk = NULL;
out:
+ sk = NULL;
+found:
rcu_read_unlock();
return sk;
}
@@ -317,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__be32 daddr = inet->inet_rcv_saddr;
__be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
struct net *net = sock_net(sk);
unsigned int hash = inet_ehashfn(net, daddr, lport,
@@ -326,39 +336,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_nulls_node *node;
- struct inet_timewait_sock *tw;
+ struct inet_timewait_sock *tw = NULL;
int twrefcnt = 0;
spin_lock(lock);
- /* Check TIME-WAIT sockets first. */
- sk_nulls_for_each(sk2, node, &head->twchain) {
- if (sk2->sk_hash != hash)
- continue;
-
- if (likely(INET_TW_MATCH(sk2, net, acookie,
- saddr, daddr, ports, dif))) {
- tw = inet_twsk(sk2);
- if (twsk_unique(sk, sk2, twp))
- goto unique;
- else
- goto not_unique;
- }
- }
- tw = NULL;
-
- /* And established part... */
sk_nulls_for_each(sk2, node, &head->chain) {
if (sk2->sk_hash != hash)
continue;
+
if (likely(INET_MATCH(sk2, net, acookie,
- saddr, daddr, ports, dif)))
+ saddr, daddr, ports, dif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ break;
+ }
goto not_unique;
+ }
}
-unique:
/* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
+ * in hash table socket with a funny identity.
+ */
inet->inet_num = lport;
inet->inet_sport = htons(lport);
sk->sk_hash = hash;
@@ -494,13 +494,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 offset = hint + port_offset;
struct inet_timewait_sock *tw = NULL;
- inet_get_local_port_range(&low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
- if (inet_is_reserved_local_port(port))
+ if (inet_is_local_reserved_port(net, port))
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 1975f52933c..f17ea49b28f 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -230,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
lro_desc->last_skb = skb;
}
-static void lro_add_frags(struct net_lro_desc *lro_desc,
- int len, int hlen, int truesize,
- struct skb_frag_struct *skb_frags,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct sk_buff *skb = lro_desc->parent;
- int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
- skb->truesize += truesize;
-
- skb_frags[0].page_offset += hlen;
- skb_frag_size_sub(&skb_frags[0], hlen);
-
- while (tcp_data_len > 0) {
- *(lro_desc->next_frag) = *skb_frags;
- tcp_data_len -= skb_frag_size(skb_frags);
- lro_desc->next_frag++;
- skb_frags++;
- skb_shinfo(skb)->nr_frags++;
- }
-}
static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
struct iphdr *iph,
@@ -371,128 +348,6 @@ out:
return 1;
}
-
-static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- void *mac_hdr,
- int hlen, __wsum sum,
- u32 ip_summed)
-{
- struct sk_buff *skb;
- struct skb_frag_struct *skb_frags;
- int data_len = len;
- int hdr_len = min(len, hlen);
-
- skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad);
- if (!skb)
- return NULL;
-
- skb_reserve(skb, lro_mgr->frag_align_pad);
- skb->len = len;
- skb->data_len = len - hdr_len;
- skb->truesize += true_size;
- skb->tail += hdr_len;
-
- memcpy(skb->data, mac_hdr, hdr_len);
-
- skb_frags = skb_shinfo(skb)->frags;
- while (data_len > 0) {
- *skb_frags = *frags;
- data_len -= skb_frag_size(frags);
- skb_frags++;
- frags++;
- skb_shinfo(skb)->nr_frags++;
- }
-
- skb_shinfo(skb)->frags[0].page_offset += hdr_len;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len);
-
- skb->ip_summed = ip_summed;
- skb->csum = sum;
- skb->protocol = eth_type_trans(skb, lro_mgr->dev);
- return skb;
-}
-
-static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- void *priv, __wsum sum)
-{
- struct net_lro_desc *lro_desc;
- struct iphdr *iph;
- struct tcphdr *tcph;
- struct sk_buff *skb;
- u64 flags;
- void *mac_hdr;
- int mac_hdr_len;
- int hdr_len = LRO_MAX_PG_HLEN;
- int vlan_hdr_len = 0;
-
- if (!lro_mgr->get_frag_header ||
- lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
- (void *)&tcph, &flags, priv)) {
- mac_hdr = skb_frag_address(frags);
- goto out1;
- }
-
- if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
- goto out1;
-
- hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
- mac_hdr_len = (int)((void *)(iph) - mac_hdr);
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (!lro_desc)
- goto out1;
-
- if (!lro_desc->active) { /* start new lro session */
- if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
- goto out1;
-
- skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
- hdr_len, 0, lro_mgr->ip_summed_aggr);
- if (!skb)
- goto out;
-
- if ((skb->protocol == htons(ETH_P_8021Q)) &&
- !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
- vlan_hdr_len = VLAN_HLEN;
-
- iph = (void *)(skb->data + vlan_hdr_len);
- tcph = (void *)((u8 *)skb->data + vlan_hdr_len
- + IP_HDR_LEN(iph));
-
- lro_init_desc(lro_desc, skb, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
- return NULL;
- }
-
- if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
- goto out2;
-
- if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
- goto out2;
-
- lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
-
- if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
- lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
-
- return NULL;
-
-out2: /* send aggregated packets to the stack */
- lro_flush(lro_mgr, lro_desc);
-
-out1: /* Original packet has to be posted to the stack */
- skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
- hdr_len, sum, lro_mgr->ip_summed);
-out:
- return skb;
-}
-
void lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
void *priv)
@@ -506,23 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
}
EXPORT_SYMBOL(lro_receive_skb);
-void lro_receive_frags(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size, void *priv, __wsum sum)
-{
- struct sk_buff *skb;
-
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum);
- if (!skb)
- return;
-
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(skb);
- else
- netif_rx(skb);
-}
-EXPORT_SYMBOL(lro_receive_frags);
-
void lro_flush_all(struct net_lro_mgr *lro_mgr)
{
int i;
@@ -534,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr)
}
}
EXPORT_SYMBOL(lro_flush_all);
-
-void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct net_lro_desc *lro_desc;
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (lro_desc->active)
- lro_flush(lro_mgr, lro_desc);
-}
-EXPORT_SYMBOL(lro_flush_pkt);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1f27c9f4afd..6d592f8555f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -87,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
refcnt += inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
-#ifdef SOCK_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- pr_debug("%s timewait_sock %p refcnt=%d\n",
- tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
- }
-#endif
- while (refcnt) {
- inet_twsk_put(tw);
- refcnt--;
- }
+ BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
+ atomic_sub(refcnt, &tw->tw_refcnt);
}
-static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
+void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
twsk_destructor((struct sock *)tw);
@@ -118,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+ struct hlist_nulls_head *list)
+{
+ hlist_nulls_add_head_rcu(&tw->tw_node, list);
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind_node, list);
+}
+
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -146,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
/*
- * Step 2: Hash TW into TIMEWAIT chain.
- * Should be done before removing sk from established chain
- * because readers are lockless and search established first.
+ * Step 2: Hash TW into tcp ehash chain.
+ * Notes :
+ * - tw_refcnt is set to 3 because :
+ * - We have one reference from bhash chain.
+ * - We have one reference from ehash chain.
+ * We can use atomic_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
*/
- inet_twsk_add_node_rcu(tw, &ehead->twchain);
+ atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ inet_twsk_add_node_rcu(tw, &ehead->chain);
- /* Step 3: Remove SK from established hash. */
+ /* Step 3: Remove SK from hash chain */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- /*
- * Notes :
- * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
- * - We add one reference for the bhash link
- * - We add one reference for the ehash link
- * - We want this refcnt update done before allowing other
- * threads to find this tw in ehash chain.
- */
- atomic_add(1 + 1 + 1, &tw->tw_refcnt);
-
spin_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -387,11 +386,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
if (slot >= INET_TWDR_TWKILL_SLOTS)
slot = INET_TWDR_TWKILL_SLOTS - 1;
}
- tw->tw_ttd = jiffies + timeo;
+ tw->tw_ttd = inet_tw_time_stamp() + timeo;
slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
list = &twdr->cells[slot];
} else {
- tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+ tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
if (twdr->twcal_hand < 0) {
twdr->twcal_hand = 0;
@@ -490,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
restart_rcu:
rcu_read_lock();
restart:
- sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_state != TCP_TIME_WAIT)
+ continue;
tw = inet_twsk(sk);
if ((tw->tw_family != family) ||
atomic_read(&twsk_net(tw)->count))
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 000e3d239d6..bd5f5928167 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -26,20 +26,7 @@
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size uses a constant ID and do not use this code (see
- * ip_select_ident() in include/net/ip.h).
*
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
@@ -62,7 +49,6 @@
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
* daddr: unchangeable
- * ip_id_count: atomic value (no lock needed)
*/
static struct kmem_cache *peer_cachep __read_mostly;
@@ -109,13 +95,6 @@ static inline void flush_check(struct inet_peer_base *base, int family)
}
}
-void inetpeer_invalidate_family(int family)
-{
- atomic_t *fp = inetpeer_seq_ptr(family);
-
- atomic_inc(fp);
-}
-
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
/* Exported for sysctl_net_ipv4. */
@@ -127,7 +106,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
static void inetpeer_gc_worker(struct work_struct *work)
{
struct inet_peer *p, *n, *c;
- LIST_HEAD(list);
+ struct list_head list;
spin_lock_bh(&gc_lock);
list_replace_init(&gc_list, &list);
@@ -227,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a,
stackptr = _stack; \
*stackptr++ = &_base->root; \
for (u = rcu_deref_locked(_base->root, _base); \
- u != peer_avl_empty; ) { \
+ u != peer_avl_empty;) { \
int cmp = addr_compare(_daddr, &u->daddr); \
if (cmp == 0) \
break; \
@@ -282,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
*stackptr++ = &start->avl_left; \
v = &start->avl_left; \
for (u = rcu_deref_locked(*v, base); \
- u->avl_right != peer_avl_empty_rcu; ) { \
+ u->avl_right != peer_avl_empty_rcu;) { \
v = &u->avl_right; \
*stackptr++ = v; \
u = rcu_deref_locked(*v, base); \
@@ -504,10 +483,6 @@ relookup:
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count,
- (daddr->family == AF_INET) ?
- secure_ip_id(daddr->addr.a4) :
- secure_ipv6_id(daddr->addr.a6));
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
/* 60*HZ is arbitrary, but chosen enough high so that the first
@@ -529,7 +504,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
p->dtime = (__u32)jiffies;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&p->refcnt);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 694de3b7aeb..3a83ce5efa8 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -39,6 +39,24 @@
#include <net/route.h>
#include <net/xfrm.h>
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+ return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+ skb->ignore_df;
+}
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+
static int ip_forward_finish(struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
@@ -54,10 +72,15 @@ static int ip_forward_finish(struct sk_buff *skb)
int ip_forward(struct sk_buff *skb)
{
+ u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
+ /* that should never happen */
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
if (skb_warn_if_lro(skb))
goto drop;
@@ -67,9 +90,6 @@ int ip_forward(struct sk_buff *skb)
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
- if (skb->pkt_type != PACKET_HOST)
- goto drop;
-
skb_forward_csum(skb);
/*
@@ -88,11 +108,12 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
- if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
- (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->dst)));
+ htonl(mtu));
goto drop;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b66910aaef4..ed32313e307 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -106,6 +106,7 @@ struct ip4_create_arg {
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
+ net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
@@ -231,8 +232,9 @@ static void ip_expire(unsigned long arg)
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (qp->user == IP_DEFRAG_AF_PACKET ||
- (qp->user == IP_DEFRAG_CONNTRACK_IN &&
- skb_rtable(head)->rt_type != RTN_LOCAL))
+ ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+ (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL)))
goto out_rcu_unlock;
@@ -703,7 +705,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
if (ip_defrag(skb, user))
return NULL;
- skb->rxhash = 0;
+ skb_clear_hash(skb);
}
}
return skb;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d7aea4c5b94..9b842544aea 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -178,7 +178,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
else
itn = net_generic(net, ipgre_net_id);
- iph = (const struct iphdr *)skb->data;
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->daddr, iph->saddr, tpi->key);
@@ -217,6 +217,7 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ skb_pop_mac_header(skb);
ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
return PACKET_RCVD;
}
@@ -277,7 +278,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
free_skb:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
out:
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
@@ -300,7 +301,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
free_skb:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
out:
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
@@ -409,7 +410,7 @@ static int ipgre_open(struct net_device *dev)
struct flowi4 fl4;
struct rtable *rt;
- rt = ip_route_output_gre(dev_net(dev), &fl4,
+ rt = ip_route_output_gre(t->net, &fl4,
t->parms.iph.daddr,
t->parms.iph.saddr,
t->parms.o_key,
@@ -433,7 +434,7 @@ static int ipgre_close(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
- in_dev = inetdev_by_index(dev_net(dev), t->mlink);
+ in_dev = inetdev_by_index(t->net, t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
}
@@ -462,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
+ dev->type = ARPHRD_IPGRE;
ip_tunnel_setup(dev, ipgre_net_id);
}
@@ -476,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev)
dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
- dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
+ dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
@@ -500,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
- dev->type = ARPHRD_IPGRE;
dev->flags = IFF_NOARP;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
dev->addr_len = 4;
@@ -648,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev)
{
ether_setup(dev);
dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
ip_tunnel_setup(dev, gre_tap_net_id);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 054a3e97d82..3d4da2c16b6 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -314,7 +314,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- if (sysctl_ip_early_demux && !skb_dst(skb)) {
+ if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ec7264514a8..ad382499bac 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -167,7 +167,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
soffset -= 4;
if (soffset > 3) {
memcpy(&faddr, &start[soffset-1], 4);
- for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+ for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
memcpy(&dptr[doffset-1], &start[soffset-1], 4);
/*
* RFC1812 requires to fix illegal source routes.
@@ -227,7 +227,7 @@ void ip_options_fragment(struct sk_buff *skb)
continue;
}
optlen = optptr[1];
- if (optlen<2 || optlen>l)
+ if (optlen < 2 || optlen > l)
return;
if (!IPOPT_COPIED(*optptr))
memset(optptr, IPOPT_NOOP, optlen);
@@ -275,27 +275,31 @@ int ip_options_compile(struct net *net,
for (l = opt->optlen; l > 0; ) {
switch (*optptr) {
- case IPOPT_END:
- for (optptr++, l--; l>0; optptr++, l--) {
+ case IPOPT_END:
+ for (optptr++, l--; l > 0; optptr++, l--) {
if (*optptr != IPOPT_END) {
*optptr = IPOPT_END;
opt->is_changed = 1;
}
}
goto eol;
- case IPOPT_NOOP:
+ case IPOPT_NOOP:
l--;
optptr++;
continue;
}
+ if (unlikely(l < 2)) {
+ pp_ptr = optptr;
+ goto error;
+ }
optlen = optptr[1];
- if (optlen<2 || optlen>l) {
+ if (optlen < 2 || optlen > l) {
pp_ptr = optptr;
goto error;
}
switch (*optptr) {
- case IPOPT_SSRR:
- case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
if (optlen < 3) {
pp_ptr = optptr + 1;
goto error;
@@ -321,7 +325,7 @@ int ip_options_compile(struct net *net,
opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
opt->srr = optptr - iph;
break;
- case IPOPT_RR:
+ case IPOPT_RR:
if (opt->rr) {
pp_ptr = optptr;
goto error;
@@ -349,7 +353,7 @@ int ip_options_compile(struct net *net,
}
opt->rr = optptr - iph;
break;
- case IPOPT_TIMESTAMP:
+ case IPOPT_TIMESTAMP:
if (opt->ts) {
pp_ptr = optptr;
goto error;
@@ -364,19 +368,19 @@ int ip_options_compile(struct net *net,
}
if (optptr[2] <= optlen) {
unsigned char *timeptr = NULL;
- if (optptr[2]+3 > optptr[1]) {
+ if (optptr[2]+3 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
switch (optptr[3]&0xF) {
- case IPOPT_TS_TSONLY:
+ case IPOPT_TS_TSONLY:
if (skb)
timeptr = &optptr[optptr[2]-1];
opt->ts_needtime = 1;
optptr[2] += 4;
break;
- case IPOPT_TS_TSANDADDR:
- if (optptr[2]+7 > optptr[1]) {
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
@@ -389,8 +393,8 @@ int ip_options_compile(struct net *net,
opt->ts_needtime = 1;
optptr[2] += 8;
break;
- case IPOPT_TS_PRESPEC:
- if (optptr[2]+7 > optptr[1]) {
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
@@ -405,7 +409,7 @@ int ip_options_compile(struct net *net,
opt->ts_needtime = 1;
optptr[2] += 8;
break;
- default:
+ default:
if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr + 3;
goto error;
@@ -433,7 +437,7 @@ int ip_options_compile(struct net *net,
}
opt->ts = optptr - iph;
break;
- case IPOPT_RA:
+ case IPOPT_RA:
if (optlen < 4) {
pp_ptr = optptr + 1;
goto error;
@@ -441,7 +445,7 @@ int ip_options_compile(struct net *net,
if (optptr[2] == 0 && optptr[3] == 0)
opt->router_alert = optptr - iph;
break;
- case IPOPT_CIPSO:
+ case IPOPT_CIPSO:
if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
pp_ptr = optptr;
goto error;
@@ -452,9 +456,9 @@ int ip_options_compile(struct net *net,
goto error;
}
break;
- case IPOPT_SEC:
- case IPOPT_SID:
- default:
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr;
goto error;
@@ -572,7 +576,7 @@ void ip_forward_options(struct sk_buff *skb)
optptr = raw + opt->srr;
- for ( srrptr=optptr[2], srrspace = optptr[1];
+ for ( srrptr = optptr[2], srrspace = optptr[1];
srrptr <= srrspace;
srrptr += 4
) {
@@ -628,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
if (rt->rt_type != RTN_LOCAL)
return -EINVAL;
- for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
if (srrptr + 3 > srrspace) {
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
return -EINVAL;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9ee17e3d11c..8d3b6b0e985 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb)
skb_dst(skb)->dev, dst_output);
}
-int ip_local_out(struct sk_buff *skb)
+int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
{
int err;
err = __ip_local_out(skb);
if (likely(err == 1))
- err = dst_output(skb);
+ err = dst_output_sk(sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out);
+EXPORT_SYMBOL_GPL(ip_local_out_sk);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(iph, &rt->dst, sk);
+ ip_select_ident(skb, sk);
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
+static int ip_finish_output_gso(struct sk_buff *skb)
+{
+ netdev_features_t features;
+ struct sk_buff *segs;
+ int ret = 0;
+
+ /* common case: locally created skb or seglen is <= mtu */
+ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+ skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+ return ip_finish_output2(skb);
+
+ /* Slowpath - GSO segment length is exceeding the dst MTU.
+ *
+ * This can happen in two cases:
+ * 1) TCP GRO packet, DF bit not set
+ * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
+ * from host network stack.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = ip_fragment(segs, ip_finish_output2);
+
+ if (err && ret == 0)
+ ret = err;
+ segs = nskb;
+ } while (segs);
+
+ return ret;
+}
+
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb)
return dst_output(skb);
}
#endif
- if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
+ if (skb_is_gso(skb))
+ return ip_finish_output_gso(skb);
+
+ if (skb->len > ip_skb_dst_mtu(skb))
return ip_fragment(skb, ip_finish_output2);
- else
- return ip_finish_output2(skb);
+
+ return ip_finish_output2(skb);
}
-int ip_mc_output(struct sk_buff *skb)
+int ip_mc_output(struct sock *sk, struct sk_buff *skb)
{
- struct sock *sk = skb->sk;
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
@@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sk_buff *skb)
+int ip_output(struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
@@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
-int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
- struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
@@ -371,7 +415,7 @@ packet_routed:
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
@@ -386,9 +430,9 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_more(iph, &rt->dst, sk,
- (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+ ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
+ /* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
@@ -422,9 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
- to->nf_trace = from->nf_trace;
-#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
to->ipvs_property = from->ipvs_property;
#endif
@@ -458,12 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
iph = ip_hdr(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
+ mtu = ip_skb_dst_mtu(skb);
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
(IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
+ IPCB(skb)->frag_max_size > mtu))) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(ip_skb_dst_mtu(skb)));
+ htonl(mtu));
kfree_skb(skb);
return -EMSGSIZE;
}
@@ -473,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
*/
hlen = iph->ihl * 4;
- mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
+ mtu = mtu - hlen; /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
if (skb->nf_bridge)
mtu -= nf_bridge_mtu_reduction(skb);
@@ -772,15 +814,20 @@ static inline int ip_ufo_append_data(struct sock *sk,
/* initialize protocol header pointer */
skb->transport_header = skb->network_header + fragheaderlen;
- skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
- /* specify the length of each IP datagram fragment */
- skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
__skb_queue_tail(queue, skb);
+ } else if (skb_is_gso(skb)) {
+ goto append;
}
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ /* specify the length of each IP datagram fragment */
+ skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+append:
return skb_append_datato_frags(sk, skb, getfrag, from,
(length - transhdrlen));
}
@@ -805,7 +852,7 @@ static int __ip_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
- unsigned int maxfraglen, fragheaderlen;
+ unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
@@ -818,10 +865,11 @@ static int __ip_append_data(struct sock *sk,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
- if (cork->length + length > 0xFFFF - fragheaderlen) {
+ if (cork->length + length > maxnonfragsize - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
- mtu-exthdrlen);
+ mtu - (opt ? opt->optlen : 0));
return -EMSGSIZE;
}
@@ -1030,7 +1078,6 @@ error:
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
struct ipcm_cookie *ipc, struct rtable **rtp)
{
- struct inet_sock *inet = inet_sk(sk);
struct ip_options_rcu *opt;
struct rtable *rt;
@@ -1056,10 +1103,13 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
* We steal reference to this route, caller should not release it
*/
*rtp = NULL;
- cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(&rt->dst);
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+ dst_mtu(&rt->dst) : rt->dst.dev->mtu;
cork->dst = &rt->dst;
cork->length = 0;
+ cork->ttl = ipc->ttl;
+ cork->tos = ipc->tos;
+ cork->priority = ipc->priority;
cork->tx_flags = ipc->tx_flags;
return 0;
@@ -1114,7 +1164,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
int mtu;
int len;
int err;
- unsigned int maxfraglen, fragheaderlen, fraggap;
+ unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
if (inet->hdrincl)
return -EPERM;
@@ -1138,9 +1188,11 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
- if (cork->length + size > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
+ if (cork->length + size > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
return -EMSGSIZE;
}
@@ -1297,13 +1349,13 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
* to fragment the frame generated here. No matter, what transforms
* how transforms change size of the packet, it will come out.
*/
- if (inet->pmtudisc < IP_PMTUDISC_DO)
- skb->local_df = 1;
+ skb->ignore_df = ip_sk_ignore_df(sk);
/* DF bit is set when we want to see DF on outgoing frames.
- * If local_df is set too, we still allow to fragment this frame
+ * If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+ if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ inet->pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1311,27 +1363,29 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (cork->flags & IPCORK_OPT)
opt = cork->opt;
- if (rt->rt_type == RTN_MULTICAST)
+ if (cork->ttl != 0)
+ ttl = cork->ttl;
+ else if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
ttl = ip_select_ttl(inet, &rt->dst);
- iph = (struct iphdr *)skb->data;
+ iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = inet->tos;
+ iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
iph->frag_off = df;
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
- ip_select_ident(iph, &rt->dst, sk);
+ ip_select_ident(skb, sk);
if (opt) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, cork->addr, rt, 0);
}
- skb->priority = sk->sk_priority;
+ skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
skb->mark = sk->sk_mark;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
@@ -1481,6 +1535,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
ipc.addr = daddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
@@ -1489,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
daddr = replyopts.opt.opt.faddr;
}
- flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+ flowi4_init_output(&fl4, arg->bound_dev_if,
+ IP4_REPLY_MARK(net, skb->mark),
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
@@ -1534,7 +1591,7 @@ void __init ip_init(void)
ip_rt_init();
inet_initpeers();
-#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
- igmp_mc_proc_init();
+#if defined(CONFIG_IP_MULTICAST)
+ igmp_mc_init();
#endif
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d9c4f113d70..64741b93863 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -56,7 +56,6 @@
/*
* SOL_IP control messages.
*/
-#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))
static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
@@ -187,14 +186,31 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
}
EXPORT_SYMBOL(ip_cmsg_recv);
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+ bool allow_ipv6)
{
- int err;
+ int err, val;
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
+#if defined(CONFIG_IPV6)
+ if (allow_ipv6 &&
+ cmsg->cmsg_level == SOL_IPV6 &&
+ cmsg->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *src_info;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+ return -EINVAL;
+ src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+ return -EINVAL;
+ ipc->oif = src_info->ipi6_ifindex;
+ ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+ continue;
+ }
+#endif
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
@@ -215,6 +231,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
ipc->addr = info->ipi_spec_dst.s_addr;
break;
}
+ case IP_TTL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->ttl = val;
+ break;
+ case IP_TOS:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ ipc->tos = val;
+ ipc->priority = rt_tos2priority(ipc->tos);
+ break;
+
default:
return -EINVAL;
}
@@ -368,11 +402,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
/*
* Handle MSG_ERRQUEUE
*/
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct sock_exterr_skb *serr;
struct sk_buff *skb, *skb2;
- struct sockaddr_in *sin;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct {
struct sock_extended_err ee;
struct sockaddr_in offender;
@@ -398,13 +432,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
serr = SKB_EXT_ERR(skb);
- sin = (struct sockaddr_in *)msg->msg_name;
if (sin) {
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
serr->addr_offset);
sin->sin_port = serr->port;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
@@ -609,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
inet->nodefrag = val ? 1 : 0;
break;
case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval;
inet->pmtudisc = val;
break;
@@ -1032,13 +1066,15 @@ e_inval:
*
* To support IP_CMSG_PKTINFO option, we store rt_iif and specific
* destination in skb->cb[] before dst drop.
- * This way, receiver doesnt make cache line misses to read rtable.
+ * This way, receiver doesn't make cache line misses to read rtable.
*/
-void ipv4_pktinfo_prepare(struct sk_buff *skb)
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
{
struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
+ ipv6_sk_rxinfo(sk);
- if (skb_rtable(skb)) {
+ if (prepare && skb_rtable(skb)) {
pktinfo->ipi_ifindex = inet_iif(skb);
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ac9fabe0300..6f9de61dce5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -40,6 +40,7 @@
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
+#include <linux/err.h>
#include <net/sock.h>
#include <net/ip.h>
@@ -61,57 +62,59 @@
#include <net/ip6_route.h>
#endif
-static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
- __be32 key, __be32 remote)
+static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
return hash_32((__force u32)key ^ (__force u32)remote,
IP_TNL_HASH_BITS);
}
-/* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
+static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
+ struct dst_entry *dst)
{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+ struct dst_entry *old_dst;
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
+ dst_clone(dst);
+ old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
+ dst_release(old_dst);
+}
- tot->multicast = dev->stats.multicast;
+static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+{
+ __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
+}
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
+static void tunnel_dst_reset(struct ip_tunnel *t)
+{
+ tunnel_dst_set(t, NULL);
+}
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
+void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
+{
+ int i;
- tot->collisions = dev->stats.collisions;
+ for_each_possible_cpu(i)
+ __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
+}
+EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
- return tot;
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
+{
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
+ if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+ dst = NULL;
+ if (dst) {
+ if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ tunnel_dst_reset(t);
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return (struct rtable *)dst;
}
-EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
__be16 flags, __be32 key)
@@ -146,7 +149,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
struct ip_tunnel *t, *cand = NULL;
struct hlist_head *head;
- hash = ip_tunnel_hash(itn, key, remote);
+ hash = ip_tunnel_hash(key, remote);
head = &itn->tunnels[hash];
hlist_for_each_entry_rcu(t, head, hash_node) {
@@ -166,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
hlist_for_each_entry_rcu(t, head, hash_node) {
if (remote != t->parms.iph.daddr ||
+ t->parms.iph.saddr != 0 ||
!(t->dev->flags & IFF_UP))
continue;
@@ -178,14 +182,15 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
- hash = ip_tunnel_hash(itn, key, 0);
+ hash = ip_tunnel_hash(key, 0);
head = &itn->tunnels[hash];
hlist_for_each_entry_rcu(t, head, hash_node) {
- if ((local != t->parms.iph.saddr &&
- (local != t->parms.iph.daddr ||
- !ipv4_is_multicast(local))) ||
- !(t->dev->flags & IFF_UP))
+ if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+ (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+ continue;
+
+ if (!(t->dev->flags & IFF_UP))
continue;
if (!ip_tunnel_key_match(&t->parms, flags, key))
@@ -202,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
hlist_for_each_entry_rcu(t, head, hash_node) {
if (t->parms.i_key != key ||
+ t->parms.iph.saddr != 0 ||
+ t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
continue;
@@ -228,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
{
unsigned int h;
__be32 remote;
+ __be32 i_key = parms->i_key;
if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
remote = parms->iph.daddr;
else
remote = 0;
- h = ip_tunnel_hash(itn, parms->i_key, remote);
+ if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+ i_key = 0;
+
+ h = ip_tunnel_hash(i_key, remote);
return &itn->tunnels[h];
}
@@ -257,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
+ __be16 flags = parms->i_flags;
int link = parms->link;
struct ip_tunnel *t = NULL;
struct hlist_head *head = ip_bucket(itn, parms);
@@ -264,9 +276,9 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
hlist_for_each_entry_rcu(t, head, hash_node) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
- key == t->parms.i_key &&
link == t->parms.link &&
- type == t->dev->type)
+ type == t->dev->type &&
+ ip_tunnel_key_match(&t->parms, flags, key))
break;
}
return t;
@@ -318,11 +330,10 @@ failed:
return ERR_PTR(err);
}
-static inline struct rtable *ip_route_output_tunnel(struct net *net,
- struct flowi4 *fl4,
- int proto,
- __be32 daddr, __be32 saddr,
- __be32 key, __u8 tos, int oif)
+static inline void init_tunnel_flow(struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
{
memset(fl4, 0, sizeof(*fl4));
fl4->flowi4_oif = oif;
@@ -331,7 +342,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net,
fl4->flowi4_tos = tos;
fl4->flowi4_proto = proto;
fl4->fl4_gre_key = key;
- return ip_route_output_key(net, fl4);
}
static int ip_tunnel_bind_dev(struct net_device *dev)
@@ -350,14 +360,14 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
struct flowi4 fl4;
struct rtable *rt;
- rt = ip_route_output_tunnel(tunnel->net, &fl4,
- tunnel->parms.iph.protocol,
- iph->daddr, iph->saddr,
- tunnel->parms.o_key,
- RT_TOS(iph->tos),
- tunnel->parms.link);
+ init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+ iph->saddr, tunnel->parms.o_key,
+ RT_TOS(iph->tos), tunnel->parms.link);
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
+ tunnel_dst_set(tunnel, &rt->dst);
ip_rt_put(rt);
}
if (dev->type != ARPHRD_ETHER)
@@ -386,14 +396,13 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
struct ip_tunnel_net *itn,
struct ip_tunnel_parm *parms)
{
- struct ip_tunnel *nt, *fbt;
+ struct ip_tunnel *nt;
struct net_device *dev;
BUG_ON(!itn->fb_tunnel_dev);
- fbt = netdev_priv(itn->fb_tunnel_dev);
dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
if (IS_ERR(dev))
- return NULL;
+ return ERR_CAST(dev);
dev->mtu = ip_tunnel_bind_dev(dev);
@@ -405,15 +414,12 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
- struct pcpu_tstats *tstats;
+ struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
int err;
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
tunnel->dev->stats.multicast++;
skb->pkt_type = PACKET_BROADCAST;
}
@@ -436,6 +442,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tunnel->i_seqno = ntohl(tpi->seq) + 1;
}
+ skb_reset_network_header(skb);
+
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
@@ -454,6 +462,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
if (tunnel->dev->type == ARPHRD_ETHER) {
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
@@ -461,8 +471,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->dev = tunnel->dev;
}
- skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
-
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -532,8 +540,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
int err;
+ bool connected;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ connected = (tunnel->parms.iph.daddr != 0);
dst = tnl_params->daddr;
if (dst == 0) {
@@ -581,27 +591,38 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
#endif
else
goto tx_error;
+
+ connected = false;
}
tos = tnl_params->tos;
if (tos & 0x1) {
tos &= ~0x1;
- if (skb->protocol == htons(ETH_P_IP))
+ if (skb->protocol == htons(ETH_P_IP)) {
tos = inner_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
+ connected = false;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ connected = false;
+ }
}
- rt = ip_route_output_tunnel(tunnel->net, &fl4,
- protocol,
- dst, tnl_params->saddr,
- tunnel->parms.o_key,
- RT_TOS(tos),
- tunnel->parms.link);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error;
+ init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+ tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+ rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
+
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (connected)
+ tunnel_dst_set(tunnel, &rt->dst);
}
+
if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
@@ -618,11 +639,13 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
dst_link_failure(skb);
} else
tunnel->err_count = 0;
}
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = tnl_params->ttl;
if (ttl == 0) {
if (skb->protocol == htons(ETH_P_IP))
@@ -641,18 +664,18 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len;
- if (max_headroom > dev->needed_headroom) {
+ if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
- if (skb_cow_head(skb, dev->needed_headroom)) {
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return;
- }
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return;
}
- err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
- ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df,
- !net_eq(tunnel->net, dev_net(dev)));
+ err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
+ tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return;
@@ -663,7 +686,7 @@ tx_error_icmp:
#endif
tx_error:
dev->stats.tx_errors++;
- dev_kfree_skb(skb);
+ kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
@@ -696,25 +719,25 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
if (set_mtu)
dev->mtu = mtu;
}
+ ip_tunnel_dst_reset_all(t);
netdev_state_change(dev);
}
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
int err = 0;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
BUG_ON(!itn->fb_tunnel_dev);
switch (cmd) {
case SIOCGETTUNNEL:
- t = NULL;
- if (dev == itn->fb_tunnel_dev)
+ if (dev == itn->fb_tunnel_dev) {
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
- if (t == NULL)
- t = netdev_priv(dev);
+ if (t == NULL)
+ t = netdev_priv(dev);
+ }
memcpy(p, &t->parms, sizeof(*p));
break;
@@ -725,16 +748,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
goto done;
if (p->iph.ttl)
p->iph.frag_off |= htons(IP_DF);
- if (!(p->i_flags&TUNNEL_KEY))
- p->i_key = 0;
- if (!(p->o_flags&TUNNEL_KEY))
- p->o_key = 0;
+ if (!(p->i_flags & VTI_ISVTI)) {
+ if (!(p->i_flags & TUNNEL_KEY))
+ p->i_key = 0;
+ if (!(p->o_flags & TUNNEL_KEY))
+ p->o_key = 0;
+ }
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
- if (!t && (cmd == SIOCADDTUNNEL))
+ if (!t && (cmd == SIOCADDTUNNEL)) {
t = ip_tunnel_create(net, itn, p);
-
+ err = PTR_ERR_OR_ZERO(t);
+ break;
+ }
if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
if (t != NULL) {
if (t->dev != dev) {
@@ -761,8 +788,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
if (t) {
err = 0;
ip_tunnel_update(itn, t, dev, p, true);
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ } else {
+ err = -ENOENT;
+ }
break;
case SIOCDELTUNNEL:
@@ -811,6 +839,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
gro_cells_destroy(&tunnel->gro_cells);
+ free_percpu(tunnel->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -853,11 +882,14 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
- if (!IS_ERR(itn->fb_tunnel_dev))
+ if (!IS_ERR(itn->fb_tunnel_dev)) {
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+ ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ }
rtnl_unlock();
- return PTR_RET(itn->fb_tunnel_dev);
+ return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
@@ -884,8 +916,6 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
if (!net_eq(dev_net(t->dev), net))
unregister_netdevice_queue(t->dev, head);
}
- if (itn->fb_tunnel_dev)
- unregister_netdevice_queue(itn->fb_tunnel_dev, head);
}
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
@@ -979,12 +1009,19 @@ int ip_tunnel_init(struct net_device *dev)
int err;
dev->destructor = ip_tunnel_dev_free;
- dev->tstats = alloc_percpu(struct pcpu_tstats);
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
+ tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
+ if (!tunnel->dst_cache) {
+ free_percpu(dev->tstats);
+ return -ENOMEM;
+ }
+
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
+ free_percpu(tunnel->dst_cache);
free_percpu(dev->tstats);
return err;
}
@@ -1009,6 +1046,8 @@ void ip_tunnel_uninit(struct net_device *dev)
/* fb_tunnel_dev will be unregisted in net-exit call. */
if (itn->fb_tunnel_dev != dev)
ip_tunnel_del(netdev_priv(dev));
+
+ ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index d6c856b17fd..f4c987bb7e9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -46,7 +46,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
-int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
+int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
{
@@ -56,12 +56,12 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
skb_scrub_packet(skb, xnet);
- skb->rxhash = 0;
+ skb_clear_hash(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
/* Push down and install the IP header. */
- __skb_push(skb, sizeof(struct iphdr));
+ skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
@@ -74,9 +74,9 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+ __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
- err = ip_local_out(skb);
+ err = ip_local_out_sk(sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
@@ -107,8 +107,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
nf_reset(skb);
secpath_reset(skb);
- if (!skb->l4_rxhash)
- skb->rxhash = 0;
+ skb_clear_hash_if_not_l4(skb);
skb_dst_drop(skb);
skb->vlan_tci = 0;
skb_set_queue_mapping(skb, 0);
@@ -116,3 +115,90 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
+ bool csum_help,
+ int gso_type_mask)
+{
+ int err;
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= gso_type_mask;
+ return skb;
+ }
+
+ /* If packet is not gso and we are resolving any partial checksum,
+ * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
+ * on the outer header without confusing devices that implement
+ * NETIF_F_IP_CSUM with encapsulation.
+ */
+ if (csum_help)
+ skb->encapsulation = 0;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
+
+/* Often modified stats are per cpu, other are shared (netdev->stats) */
+struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *tstats =
+ per_cpu_ptr(dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ tot->multicast = dev->stats.multicast;
+
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_frame_errors = dev->stats.rx_frame_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e805e7b3030..b8960f3527f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -34,6 +34,7 @@
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
+#include <linux/icmpv6.h>
#include <net/sock.h>
#include <net/ip.h>
@@ -49,145 +50,131 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;
static int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
-static int vti_err(struct sk_buff *skb, u32 info)
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
{
-
- /* All the routers (except for Linux) return only
- * 8 bytes of packet payload. It means, that precise relaying of
- * ICMP in the real Internet is absolutely infeasible.
- */
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph = ip_hdr(skb);
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
- struct iphdr *iph = (struct iphdr *)skb->data;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- struct ip_tunnel *t;
- int err;
-
- switch (type) {
- default:
- case ICMP_PARAMETERPROB:
- return 0;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return 0;
- default:
- /* All others are translated to HOST_UNREACH. */
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return 0;
- break;
- }
- err = -ENOENT;
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel != NULL) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
- if (t == NULL)
- goto out;
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+ skb->mark = be32_to_cpu(tunnel->parms.i_key);
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, IPPROTO_IPIP, 0);
- err = 0;
- goto out;
+ return xfrm_input(skb, nexthdr, spi, encap_type);
}
- err = 0;
- if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- goto out;
-
- if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
- t->err_count++;
- else
- t->err_count = 1;
- t->err_time = jiffies;
-out:
- return err;
+ return -EINVAL;
+drop:
+ kfree_skb(skb);
+ return 0;
}
-/* We dont digest the packet therefore let the packet pass */
static int vti_rcv(struct sk_buff *skb)
{
- struct ip_tunnel *tunnel;
- const struct iphdr *iph = ip_hdr(skb);
- struct net *net = dev_net(skb->dev);
- struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->saddr, iph->daddr, 0);
- if (tunnel != NULL) {
- struct pcpu_tstats *tstats;
-
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
- return -1;
+ return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+ unsigned short family;
+ struct net_device *dev;
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
- skb->mark = 0;
- secpath_reset(skb);
- skb->dev = tunnel->dev;
+ if (!tunnel)
return 1;
+
+ dev = tunnel->dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
}
- return -1;
+ x = xfrm_input_state(skb);
+ family = x->inner_mode->afinfo->family;
+
+ if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+ return -EPERM;
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+ skb->dev = dev;
+
+ tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
}
-/* This function assumes it is being called from dev_queue_xmit()
- * and that skb is filled properly by that function.
- */
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+ xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+ xfrm_address_t *saddr = (xfrm_address_t *)&src;
-static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+ x->props.family != AF_INET)
+ return false;
+
+ if (!dst)
+ return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+ if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+ return false;
+
+ return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct flowi *fl)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *tiph = &tunnel->parms.iph;
- u8 tos;
- struct rtable *rt; /* Route to the other host */
+ struct ip_tunnel_parm *parms = &tunnel->parms;
+ struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev; /* Device to other host */
- struct iphdr *old_iph = ip_hdr(skb);
- __be32 dst = tiph->daddr;
- struct flowi4 fl4;
int err;
- if (skb->protocol != htons(ETH_P_IP))
- goto tx_error;
-
- tos = old_iph->tos;
+ if (!dst) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
- memset(&fl4, 0, sizeof(fl4));
- flowi4_init_output(&fl4, tunnel->parms.link,
- be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
- RT_SCOPE_UNIVERSE,
- IPPROTO_IPIP, 0,
- dst, tiph->saddr, 0, 0);
- rt = ip_route_output_key(dev_net(dev), &fl4);
- if (IS_ERR(rt)) {
+ dst_hold(dst);
+ dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
- /* if there is no transform then this tunnel is not functional.
- * Or if the xfrm is not mode tunnel.
- */
- if (!rt->dst.xfrm ||
- rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
+
+ if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
dev->stats.tx_carrier_errors++;
+ dst_release(dst);
goto tx_error_icmp;
}
- tdev = rt->dst.dev;
+
+ tdev = dst->dev;
if (tdev == dev) {
- ip_rt_put(rt);
+ dst_release(dst);
dev->stats.collisions++;
goto tx_error;
}
@@ -201,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
tunnel->err_count = 0;
}
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- nf_reset(skb);
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
err = dst_output(skb);
@@ -217,10 +202,102 @@ tx_error_icmp:
dst_link_failure(skb);
tx_error:
dev->stats.tx_errors++;
- dev_kfree_skb(skb);
+ kfree_skb(skb);
return NETDEV_TX_OK;
}
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+
+ skb->mark = be32_to_cpu(tunnel->parms.o_key);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ case htons(ETH_P_IPV6):
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ default:
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ return vti_xmit(skb, dev, &fl);
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+ __be32 spi;
+ __u32 mark;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct ip_comp_hdr *ipch;
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ int protocol = iph->protocol;
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!tunnel)
+ return -1;
+
+ mark = be32_to_cpu(tunnel->parms.o_key);
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
static int
vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
@@ -236,12 +313,19 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EINVAL;
}
+ if (!(p.i_flags & GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags & GRE_KEY))
+ p.o_key = 0;
+
+ p.i_flags = VTI_ISVTI;
+
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
if (cmd != SIOCDELTUNNEL) {
- p.i_flags |= GRE_KEY | VTI_ISVTI;
+ p.i_flags |= GRE_KEY;
p.o_flags |= GRE_KEY;
}
@@ -262,6 +346,7 @@ static const struct net_device_ops vti_netdev_ops = {
static void vti_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &vti_netdev_ops;
+ dev->type = ARPHRD_TUNNEL;
ip_tunnel_setup(dev, vti_net_id);
}
@@ -273,13 +358,11 @@ static int vti_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
- dev->type = ARPHRD_TUNNEL;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
dev->mtu = ETH_DATA_LEN;
dev->flags = IFF_NOARP;
dev->iflink = 0;
dev->addr_len = 4;
- dev->features |= NETIF_F_NETNS_LOCAL;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
@@ -296,10 +379,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
iph->ihl = 5;
}
-static struct xfrm_tunnel vti_handler __read_mostly = {
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
.handler = vti_rcv,
- .err_handler = vti_err,
- .priority = 1,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
};
static int __net_init vti_init_net(struct net *net)
@@ -343,6 +444,8 @@ static void vti_netlink_parms(struct nlattr *data[],
if (!data)
return;
+ parms->i_flags = VTI_ISVTI;
+
if (data[IFLA_VTI_LINK])
parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
@@ -438,10 +541,31 @@ static int __init vti_init(void)
err = register_pernet_device(&vti_net_ops);
if (err < 0)
return err;
- err = xfrm4_mode_tunnel_input_register(&vti_handler);
+ err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
if (err < 0) {
unregister_pernet_device(&vti_net_ops);
pr_info("vti init: can't register tunnel\n");
+
+ return err;
+ }
+
+ err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+ if (err < 0) {
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti_net_ops);
+ pr_info("vti init: can't register tunnel\n");
+
+ return err;
+ }
+
+ err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0) {
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti_net_ops);
+ pr_info("vti init: can't register tunnel\n");
+
+ return err;
}
err = rtnl_link_register(&vti_link_ops);
@@ -451,7 +575,9 @@ static int __init vti_init(void)
return err;
rtnl_link_failed:
- xfrm4_mode_tunnel_input_deregister(&vti_handler);
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
return err;
}
@@ -459,8 +585,13 @@ rtnl_link_failed:
static void __exit vti_fini(void)
{
rtnl_link_unregister(&vti_link_ops);
- if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
+ if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
+ pr_info("vti close: can't deregister tunnel\n");
+ if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
pr_info("vti close: can't deregister tunnel\n");
+ if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
+ pr_info("vti close: can't deregister tunnel\n");
+
unregister_pernet_device(&vti_net_ops);
}
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 826be4cb482..c0855d50a3f 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -23,7 +23,7 @@
#include <net/protocol.h>
#include <net/sock.h>
-static void ipcomp4_err(struct sk_buff *skb, u32 info)
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
__be32 spi;
@@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
switch (icmp_hdr(skb)->type) {
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ return 0;
case ICMP_REDIRECT:
break;
default:
- return;
+ return 0;
}
spi = htonl(ntohs(ipch->cpi));
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
spi, IPPROTO_COMP, AF_INET);
if (!x)
- return;
+ return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
else
ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
xfrm_state_put(x);
+
+ return 0;
}
/* We always hold one tunnel user reference to indicate a tunnel */
@@ -147,6 +149,11 @@ out:
return err;
}
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
static const struct xfrm_type ipcomp_type = {
.description = "IPCOMP4",
.owner = THIS_MODULE,
@@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = {
.output = ipcomp_output
};
-static const struct net_protocol ipcomp4_protocol = {
+static struct xfrm4_protocol ipcomp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ipcomp4_rcv_cb,
.err_handler = ipcomp4_err,
- .no_policy = 1,
- .netns_ok = 1,
+ .priority = 0,
};
static int __init ipcomp4_init(void)
@@ -170,7 +178,7 @@ static int __init ipcomp4_init(void)
pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ipcomp_type, AF_INET);
return -EAGAIN;
@@ -180,7 +188,7 @@ static int __init ipcomp4_init(void)
static void __exit ipcomp4_fini(void)
{
- if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
pr_info("%s: can't remove xfrm type\n", __func__);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index efa1138fa52..b3e86ea7b71 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -273,7 +273,7 @@ static int __init ic_open_devs(void)
msleep(1);
- if time_before(jiffies, next_msg)
+ if (time_before(jiffies, next_msg))
continue;
elapsed = jiffies_to_msecs(jiffies - start);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7f80fb4b82d..62eaa005e14 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->dev->ifindex, 0, IPPROTO_IPIP, 0);
+ t->parms.link, 0, IPPROTO_IPIP, 0);
err = 0;
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
IPPROTO_IPIP, 0);
err = 0;
goto out;
@@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ if (IS_ERR(skb))
+ goto out;
ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
return NETDEV_TX_OK;
tx_error:
+ kfree_skb(skb);
+out:
dev->stats.tx_errors++;
- dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -275,6 +275,7 @@ static const struct net_device_ops ipip_netdev_ops = {
#define IPIP_FEATURES (NETIF_F_SG | \
NETIF_F_FRAGLIST | \
NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
NETIF_F_HW_CSUM)
static void ipip_tunnel_setup(struct net_device *dev)
@@ -485,4 +486,5 @@ static void __exit ipip_fini(void)
module_init(ipip_init);
module_exit(ipip_fini);
MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9ae54b09254..65bcaa78904 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
- struct ipmr_result res;
- struct fib_lookup_arg arg = { .result = &res, };
int err;
+ struct ipmr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
err = fib_rules_lookup(net->ipv4.mr_rules_ops,
flowi4_to_flowi(flp4), 0, &arg);
@@ -425,6 +428,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
goto failure;
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
if (dev_open(dev))
@@ -451,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
struct mr_table *mrt;
struct flowi4 fl4 = {
.flowi4_oif = dev->ifindex,
- .flowi4_iif = skb->skb_iif,
+ .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
.flowi4_mark = skb->mark,
};
int err;
@@ -480,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->type = ARPHRD_PIMREG;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
- dev->netdev_ops = &reg_vif_netdev_ops,
+ dev->netdev_ops = &reg_vif_netdev_ops;
dev->destructor = free_netdev;
dev->features |= NETIF_F_NETNS_LOCAL;
}
@@ -517,6 +521,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
}
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
rcu_read_unlock();
@@ -1658,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
iph->tot_len = htons(skb->len);
- ip_select_ident(iph, skb_dst(skb), NULL);
+ ip_select_ident(skb, NULL);
ip_send_check(iph);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -2250,13 +2255,14 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
}
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
- u32 portid, u32 seq, struct mfc_cache *c, int cmd)
+ u32 portid, u32 seq, struct mfc_cache *c, int cmd,
+ int flags)
{
struct nlmsghdr *nlh;
struct rtmsg *rtm;
int err;
- nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2324,7 +2330,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
if (skb == NULL)
goto errout;
- err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd);
+ err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
if (err < 0)
goto errout;
@@ -2363,7 +2369,8 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
if (ipmr_fill_mroute(mrt, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- mfc, RTM_NEWROUTE) < 0)
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
goto done;
next_entry:
e++;
@@ -2377,7 +2384,8 @@ next_entry:
if (ipmr_fill_mroute(mrt, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- mfc, RTM_NEWROUTE) < 0) {
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0) {
spin_unlock_bh(&mfc_unres_lock);
goto done;
}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3e0adea9c2..7ebd6e37875 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
skb_dst_set(skb, NULL);
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
if (IS_ERR(dst))
- return PTR_ERR(dst);;
+ return PTR_ERR(dst);
skb_dst_set(skb, dst);
}
#endif
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1657e39b291..a26ce035e3f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,42 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
+config NF_TABLES_IPV4
+ depends on NF_TABLES
+ tristate "IPv4 nf_tables support"
+ help
+ This option enables the IPv4 support for nf_tables.
+
+config NFT_CHAIN_ROUTE_IPV4
+ depends on NF_TABLES_IPV4
+ tristate "IPv4 nf_tables route chain support"
+ help
+ This option enables the "route" chain for IPv4 in nf_tables. This
+ chain type is used to force packet re-routing after mangling header
+ fields such as the source, destination, type of service and
+ the packet mark.
+
+config NFT_CHAIN_NAT_IPV4
+ depends on NF_TABLES_IPV4
+ depends on NF_NAT_IPV4 && NFT_NAT
+ tristate "IPv4 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv4 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations such as the source, destination address and
+ source and destination ports.
+
+config NFT_REJECT_IPV4
+ depends on NF_TABLES_IPV4
+ default NFT_REJECT
+ tristate
+
+config NF_TABLES_ARP
+ depends on NF_TABLES
+ tristate "ARP nf_tables support"
+ help
+ This option enables the ARP support for nf_tables.
+
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3622b248b6d..90b82405331 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -27,6 +27,12 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
# NAT protocols (nf_nat)
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
+obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
+
# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 85a4f21aac1..f95b6f93814 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -1039,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
- sizeof(struct xt_counters) * num_counters) != 0)
- ret = -EFAULT;
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
xt_table_unlock(t);
return ret;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index a865f6f9401..802ddecb30b 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,14 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
-arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net = dev_net((in != NULL) ? in : out);
- return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
+ return arpt_do_table(skb, ops->hooknum, in, out,
+ net->ipv4.arptable_filter);
}
static struct nf_hook_ops *arpfilter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d23118d95ff..99e810f8467 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb,
addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[cpu];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
@@ -1226,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
- sizeof(struct xt_counters) * num_counters) != 0)
- ret = -EFAULT;
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
xt_table_unlock(t);
return ret;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0b732efd32e..2510c02c2d2 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,6 +28,7 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/checksum.h>
#include <net/ip.h>
@@ -57,15 +58,21 @@ struct clusterip_config {
struct rcu_head rcu;
};
-static LIST_HEAD(clusterip_configs);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_SPINLOCK(clusterip_lock);
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+ struct list_head configs;
+ /* lock protects the configs list */
+ spinlock_t lock;
#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
+ struct proc_dir_entry *procdir;
#endif
+};
static inline void
clusterip_config_get(struct clusterip_config *c)
@@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
+ struct net *net = dev_net(c->dev);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
local_bh_disable();
- if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+ if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
list_del_rcu(&c->list);
- spin_unlock(&clusterip_lock);
+ spin_unlock(&cn->lock);
local_bh_enable();
dev_mc_del(c->dev, c->clustermac);
@@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)
}
static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- list_for_each_entry_rcu(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
- c = __clusterip_config_find(clusterip);
+ c = __clusterip_config_find(net, clusterip);
if (c) {
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
@@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
struct net_device *dev)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
@@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
- clusterip_procdir,
+ cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
kfree(c);
@@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- spin_lock_bh(&clusterip_lock);
- list_add_rcu(&c->list, &clusterip_configs);
- spin_unlock_bh(&clusterip_lock);
+ spin_lock_bh(&cn->lock);
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
return c;
}
@@ -370,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+ config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
pr_info("no config found for %pI4, need 'new'\n",
@@ -384,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- dev = dev_get_by_name(&init_net, e->ip.iniface);
+ dev = dev_get_by_name(par->net, e->ip.iniface);
if (!dev) {
pr_info("no such interface %s\n",
e->ip.iniface);
@@ -483,7 +495,7 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(unsigned int hook,
+arp_mangle(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -492,6 +504,7 @@ arp_mangle(unsigned int hook,
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
+ struct net *net = dev_net(in ? in : out);
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -508,7 +521,7 @@ arp_mangle(unsigned int hook,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip, 0);
+ c = clusterip_config_find_get(net, payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -698,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = {
#endif /* CONFIG_PROC_FS */
+static int clusterip_net_init(struct net *net)
+{
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ INIT_LIST_HEAD(&cn->configs);
+
+ spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+ cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+ if (!cn->procdir) {
+ pr_err("Unable to proc dir entry\n");
+ return -ENOMEM;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+ .init = clusterip_net_init,
+ .exit = clusterip_net_exit,
+ .id = &clusterip_net_id,
+ .size = sizeof(struct clusterip_net),
+};
+
static int __init clusterip_tg_init(void)
{
int ret;
- ret = xt_register_target(&clusterip_tg_reg);
+ ret = register_pernet_subsys(&clusterip_net_ops);
if (ret < 0)
return ret;
+ ret = xt_register_target(&clusterip_tg_reg);
+ if (ret < 0)
+ goto cleanup_subsys;
+
ret = nf_register_hook(&cip_arp_ops);
if (ret < 0)
goto cleanup_target;
-#ifdef CONFIG_PROC_FS
- clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
- if (!clusterip_procdir) {
- pr_err("Unable to proc dir entry\n");
- ret = -ENOMEM;
- goto cleanup_hook;
- }
-#endif /* CONFIG_PROC_FS */
-
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
+
return 0;
-#ifdef CONFIG_PROC_FS
-cleanup_hook:
- nf_unregister_hook(&cip_arp_ops);
-#endif /* CONFIG_PROC_FS */
cleanup_target:
xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+ unregister_pernet_subsys(&clusterip_net_ops);
return ret;
}
static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
- proc_remove(clusterip_procdir);
-#endif
+
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+ unregister_pernet_subsys(&clusterip_net_ops);
/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
rcu_barrier_bh();
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b969131ad1c..5b6e0df4ccf 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -17,10 +17,6 @@
#include <linux/udp.h>
#include <linux/icmp.h>
#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/route.h>
-#include <net/dst.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_REJECT.h>
@@ -28,128 +24,12 @@
#include <linux/netfilter_bridge.h>
#endif
+#include <net/netfilter/ipv4/nf_reject.h>
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
-/* Send RST reply */
-static void send_reset(struct sk_buff *oldskb, int hook)
-{
- struct sk_buff *nskb;
- const struct iphdr *oiph;
- struct iphdr *niph;
- const struct tcphdr *oth;
- struct tcphdr _otcph, *tcph;
-
- /* IP header checks: fragment. */
- if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
- return;
-
- oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
- sizeof(_otcph), &_otcph);
- if (oth == NULL)
- return;
-
- /* No RST for RST. */
- if (oth->rst)
- return;
-
- if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
- return;
-
- /* Check checksum */
- if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
- return;
- oiph = ip_hdr(oldskb);
-
- nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
- LL_MAX_HEADER, GFP_ATOMIC);
- if (!nskb)
- return;
-
- skb_reserve(nskb, LL_MAX_HEADER);
-
- skb_reset_network_header(nskb);
- niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
- niph->version = 4;
- niph->ihl = sizeof(struct iphdr) / 4;
- niph->tos = 0;
- niph->id = 0;
- niph->frag_off = htons(IP_DF);
- niph->protocol = IPPROTO_TCP;
- niph->check = 0;
- niph->saddr = oiph->daddr;
- niph->daddr = oiph->saddr;
-
- skb_reset_transport_header(nskb);
- tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
- memset(tcph, 0, sizeof(*tcph));
- tcph->source = oth->dest;
- tcph->dest = oth->source;
- tcph->doff = sizeof(struct tcphdr) / 4;
-
- if (oth->ack)
- tcph->seq = oth->ack_seq;
- else {
- tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
- oldskb->len - ip_hdrlen(oldskb) -
- (oth->doff << 2));
- tcph->ack = 1;
- }
-
- tcph->rst = 1;
- tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
- niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)tcph - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- /* ip_route_me_harder expects skb->dst to be set */
- skb_dst_set_noref(nskb, skb_dst(oldskb));
-
- nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
- goto free_nskb;
-
- niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
-
- /* "Never happens" */
- if (nskb->len > dst_mtu(skb_dst(nskb)))
- goto free_nskb;
-
- nf_ct_attach(nskb, oldskb);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
- /* If we use ip_local_out for bridged traffic, the MAC source on
- * the RST will be ours, instead of the destination's. This confuses
- * some routers/firewalls, and they drop the packet. So we need to
- * build the eth header using the original destination's MAC as the
- * source, and send the RST packet directly.
- */
- if (oldskb->nf_bridge) {
- struct ethhdr *oeth = eth_hdr(oldskb);
- nskb->dev = oldskb->nf_bridge->physindev;
- niph->tot_len = htons(nskb->len);
- ip_send_check(niph);
- if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
- oeth->h_source, oeth->h_dest, nskb->len) < 0)
- goto free_nskb;
- dev_queue_xmit(nskb);
- } else
-#endif
- ip_local_out(nskb);
-
- return;
-
- free_nskb:
- kfree_skb(nskb);
-}
-
-static inline void send_unreach(struct sk_buff *skb_in, int code)
-{
- icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
-}
-
static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -157,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
- send_unreach(skb, ICMP_NET_UNREACH);
+ nf_send_unreach(skb, ICMP_NET_UNREACH);
break;
case IPT_ICMP_HOST_UNREACHABLE:
- send_unreach(skb, ICMP_HOST_UNREACH);
+ nf_send_unreach(skb, ICMP_HOST_UNREACH);
break;
case IPT_ICMP_PROT_UNREACHABLE:
- send_unreach(skb, ICMP_PROT_UNREACH);
+ nf_send_unreach(skb, ICMP_PROT_UNREACH);
break;
case IPT_ICMP_PORT_UNREACHABLE:
- send_unreach(skb, ICMP_PORT_UNREACH);
+ nf_send_unreach(skb, ICMP_PORT_UNREACH);
break;
case IPT_ICMP_NET_PROHIBITED:
- send_unreach(skb, ICMP_NET_ANO);
+ nf_send_unreach(skb, ICMP_NET_ANO);
break;
case IPT_ICMP_HOST_PROHIBITED:
- send_unreach(skb, ICMP_HOST_ANO);
+ nf_send_unreach(skb, ICMP_HOST_ANO);
break;
case IPT_ICMP_ADMIN_PROHIBITED:
- send_unreach(skb, ICMP_PKT_FILTERED);
+ nf_send_unreach(skb, ICMP_PKT_FILTERED);
break;
case IPT_TCP_RESET:
- send_reset(skb, par->hooknum);
+ nf_send_reset(skb, par->hooknum);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 67e17dcda65..a313c3fbeb4 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -244,6 +244,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
this_cpu_inc(snet->stats->cookie_valid);
opts->mss = mss;
+ opts->options |= XT_SYNPROXY_OPT_MSS;
if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
@@ -267,7 +268,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
if (th == NULL)
return NF_DROP;
- synproxy_parse_options(skb, par->thoff, th, &opts);
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
if (th->syn && !(th->ack || th->fin || th->rst)) {
/* Initial SYN from client */
@@ -296,7 +298,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
+static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -350,7 +352,8 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
/* fall through */
case TCP_CONNTRACK_SYN_SENT:
- synproxy_parse_options(skb, thoff, th, &opts);
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
if (!th->syn && th->ack &&
CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
@@ -373,7 +376,9 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
if (!th->syn || !th->ack)
break;
- synproxy_parse_options(skb, thoff, th, &opts);
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy->tsoff = opts.tsval - synproxy->its;
@@ -418,6 +423,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
static struct xt_target synproxy_tg4_reg __read_mostly = {
.name = "SYNPROXY",
.family = NFPROTO_IPV4,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
.target = synproxy_tg4,
.targetsize = sizeof(struct xt_synproxy_info),
.checkentry = synproxy_tg4_check,
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index cbc22158af4..9cb993cd224 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net,
ub->qlen++;
pm = nlmsg_data(nlh);
+ memset(pm, 0, sizeof(*pm));
/* We might not have a timestamp, get one */
if (skb->tstamp.tv64 == 0)
@@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net,
}
else if (loginfo->prefix[0] != '\0')
strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
- else
- *(pm->prefix) = '\0';
if (in && in->hard_header_len > 0 &&
skb->mac_header != skb->network_header &&
@@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net,
if (in)
strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
- else
- pm->indev_name[0] = '\0';
if (out)
strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
- else
- pm->outdev_name[0] = '\0';
/* copy_len <= skb->len, so can't fail. */
if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index c49dcd0284a..4bfaedf9b34 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -89,11 +89,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ipv4_is_multicast(iph->daddr)) {
if (ipv4_is_zeronet(iph->saddr))
return ipv4_is_local_multicast(iph->daddr) ^ invert;
- flow.flowi4_iif = 0;
- } else {
- flow.flowi4_iif = LOOPBACK_IFINDEX;
}
-
+ flow.flowi4_iif = LOOPBACK_IFINDEX;
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_oif = 0;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 50af5b45c05..e08a74a243a 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -33,20 +33,21 @@ static const struct xt_table packet_filter = {
};
static unsigned int
-iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (hook == NF_INET_LOCAL_OUT &&
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
+ return ipt_do_table(skb, ops->hooknum, in, out,
+ net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 0d8cd82e0fa..6a5079c34bb 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_mangle_hook(unsigned int hook,
+iptable_mangle_hook(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (hook == NF_INET_LOCAL_OUT)
+ if (ops->hooknum == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, out);
- if (hook == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, hook, in, out,
+ if (ops->hooknum == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, ops->hooknum, in, out,
dev_net(out)->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, hook, in, out,
+ return ipt_do_table(skb, ops->hooknum, in, out,
dev_net(in)->ipv4.iptable_mangle);
}
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 683bfaffed6..f1787c04a4d 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
}
static unsigned int
-nf_nat_ipv4_fn(unsigned int hooknum,
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -71,7 +71,7 @@ nf_nat_ipv4_fn(unsigned int hooknum,
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
/* We never see fragments: conntrack defrags on pre-routing
* and local-out, and nf_nat_out protects post-routing.
@@ -91,24 +91,16 @@ nf_nat_ipv4_fn(unsigned int hooknum,
if (nf_ct_is_untracked(ct))
return NF_ACCEPT;
- nat = nfct_nat(ct);
- if (!nat) {
- /* NAT module was loaded late. */
- if (nf_ct_is_confirmed(ct))
- return NF_ACCEPT;
- nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
- if (nat == NULL) {
- pr_debug("failed to add NAT extension\n");
- return NF_ACCEPT;
- }
- }
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- hooknum))
+ ops->hooknum))
return NF_DROP;
else
return NF_ACCEPT;
@@ -121,14 +113,14 @@ nf_nat_ipv4_fn(unsigned int hooknum,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+ ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
goto oif_changed;
}
break;
@@ -137,11 +129,11 @@ nf_nat_ipv4_fn(unsigned int hooknum,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, hooknum, skb);
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
@@ -149,7 +141,7 @@ oif_changed:
}
static unsigned int
-nf_nat_ipv4_in(unsigned int hooknum,
+nf_nat_ipv4_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -158,7 +150,7 @@ nf_nat_ipv4_in(unsigned int hooknum,
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
@@ -167,7 +159,7 @@ nf_nat_ipv4_in(unsigned int hooknum,
}
static unsigned int
-nf_nat_ipv4_out(unsigned int hooknum,
+nf_nat_ipv4_out(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -185,7 +177,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -207,7 +199,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
}
static unsigned int
-nf_nat_ipv4_local_fn(unsigned int hooknum,
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -223,7 +215,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1f82aea11df..b2f7e8f9831 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
+iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (hook == NF_INET_LOCAL_OUT &&
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
+ return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index f867a8d38bf..c86647ed207 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -37,21 +37,22 @@ static const struct xt_table security_table = {
};
static unsigned int
-iptable_security_hook(unsigned int hook, struct sk_buff *skb,
+iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (hook == NF_INET_LOCAL_OUT &&
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
+ return ipt_do_table(skb, ops->hooknum, in, out,
+ net->ipv4.iptable_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 86f5b34a4ed..8127dc80286 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
return NF_ACCEPT;
}
-static unsigned int ipv4_helper(unsigned int hooknum,
+static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -121,7 +121,7 @@ static unsigned int ipv4_helper(unsigned int hooknum,
ct, ctinfo);
}
-static unsigned int ipv4_confirm(unsigned int hooknum,
+static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -147,16 +147,16 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv4_conntrack_in(unsigned int hooknum,
+static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
+ return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
}
-static unsigned int ipv4_conntrack_local(unsigned int hooknum,
+static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
+ return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
}
/* Connection tracking may drop packets, but never alters them, so
@@ -548,9 +548,3 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
module_init(nf_conntrack_l3proto_ipv4_init);
module_exit(nf_conntrack_l3proto_ipv4_fini);
-
-void need_ipv4_conntrack(void)
-{
- return;
-}
-EXPORT_SYMBOL_GPL(need_ipv4_conntrack);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 742815518b0..b8f6381c7d0 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,7 +22,6 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-/* Returns new sk_buff, or NULL */
static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
{
int err;
@@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
err = ip_defrag(skb, user);
local_bh_enable();
- if (!err)
+ if (!err) {
ip_send_check(ip_hdr(skb));
+ skb->ignore_df = 1;
+ }
return err;
}
@@ -60,7 +61,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
return IP_DEFRAG_CONNTRACK_OUT + zone;
}
-static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -83,7 +84,9 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
#endif
/* Gather fragments. */
if (ip_is_fragment(ip_hdr(skb))) {
- enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
+ enum ip_defrag_users user =
+ nf_ct_defrag_user(ops->hooknum, skb);
+
if (nf_ct_ipv4_gather_frags(skb, user))
return NF_STOLEN;
}
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 9eea059dd62..574f7ebba0b 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -229,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
ret = nf_ct_expect_related(rtcp_exp);
if (ret == 0)
break;
- else if (ret != -EBUSY) {
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
nf_ct_unexpect_related(rtp_exp);
nated_port = 0;
break;
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 5f011cc89cd..7c676671329 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -34,8 +34,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
* Author: James Morris <jmorris@intercode.com.au>
*
@@ -462,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
}
if (subid < 40) {
- optr [0] = 0;
- optr [1] = subid;
+ optr[0] = 0;
+ optr[1] = subid;
} else if (subid < 80) {
- optr [0] = 1;
- optr [1] = subid - 40;
+ optr[0] = 1;
+ optr[1] = subid - 40;
} else {
- optr [0] = 2;
- optr [1] = subid - 80;
+ optr[0] = 2;
+ optr[1] = subid - 80;
}
*len = 2;
@@ -1199,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct,
map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
} else {
/* DNAT replies */
- map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
- map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
+ map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip);
+ map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);
}
if (map.from == map.to)
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
new file mode 100644
index 00000000000..19412a4063f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter_arp.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int
+nft_do_chain_arp(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo(&pkt, ops, skb, in, out);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static struct nft_af_info nft_af_arp __read_mostly = {
+ .family = NFPROTO_ARP,
+ .nhooks = NF_ARP_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_ARP_IN] = nft_do_chain_arp,
+ [NF_ARP_OUT] = nft_do_chain_arp,
+ [NF_ARP_FORWARD] = nft_do_chain_arp,
+ },
+};
+
+static int nf_tables_arp_init_net(struct net *net)
+{
+ net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.arp== NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
+
+ if (nft_register_afinfo(net, net->nft.arp) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.arp);
+ return -ENOMEM;
+}
+
+static void nf_tables_arp_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.arp);
+ kfree(net->nft.arp);
+}
+
+static struct pernet_operations nf_tables_arp_net_ops = {
+ .init = nf_tables_arp_init_net,
+ .exit = nf_tables_arp_exit_net,
+};
+
+static const struct nf_chain_type filter_arp = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_ARP,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_ARP_IN) |
+ (1 << NF_ARP_OUT) |
+ (1 << NF_ARP_FORWARD),
+};
+
+static int __init nf_tables_arp_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_arp);
+ ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_arp);
+
+ return ret;
+}
+
+static void __exit nf_tables_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_arp_net_ops);
+ nft_unregister_chain_type(&filter_arp);
+}
+
+module_init(nf_tables_arp_init);
+module_exit(nf_tables_arp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
new file mode 100644
index 00000000000..6820c8c4084
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+
+static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ if (unlikely(skb->len < sizeof(struct iphdr) ||
+ ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+ if (net_ratelimit())
+ pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
+ "packet\n");
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain_ipv4(ops, skb, in, out, okfn);
+}
+
+struct nft_af_info nft_af_ipv4 __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
+ [NF_INET_LOCAL_OUT] = nft_ipv4_output,
+ [NF_INET_FORWARD] = nft_do_chain_ipv4,
+ [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
+ [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
+ },
+};
+EXPORT_SYMBOL_GPL(nft_af_ipv4);
+
+static int nf_tables_ipv4_init_net(struct net *net)
+{
+ net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.ipv4 == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
+
+ if (nft_register_afinfo(net, net->nft.ipv4) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.ipv4);
+ return -ENOMEM;
+}
+
+static void nf_tables_ipv4_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.ipv4);
+ kfree(net->nft.ipv4);
+}
+
+static struct pernet_operations nf_tables_ipv4_net_ops = {
+ .init = nf_tables_ipv4_init_net,
+ .exit = nf_tables_ipv4_exit_net,
+};
+
+static const struct nf_chain_type filter_ipv4 = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+static int __init nf_tables_ipv4_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_ipv4);
+ ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_ipv4);
+
+ return ret;
+}
+
+static void __exit nf_tables_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
+ nft_unregister_chain_type(&filter_ipv4);
+}
+
+module_init(nf_tables_ipv4_init);
+module_exit(nf_tables_ipv4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
new file mode 100644
index 00000000000..3964157d826
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+/*
+ * NAT chains
+ */
+
+static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn_nat *nat;
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ struct nft_pktinfo pkt;
+ unsigned int ret;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED + IP_CT_IS_REPLY:
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ ops->hooknum))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall through */
+ case IP_CT_NEW:
+ if (nf_nat_initialized(ct, maniptype))
+ break;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+ ret = nft_do_chain(&pkt, ops);
+ if (ret != NF_ACCEPT)
+ return ret;
+ if (!nf_nat_initialized(ct, maniptype)) {
+ ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ if (ret != NF_ACCEPT)
+ return ret;
+ }
+ default:
+ break;
+ }
+
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ __be32 daddr = ip_hdr(skb)->daddr;
+ unsigned int ret;
+
+ ret = nf_nat_fn(ops, skb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ ip_hdr(skb)->daddr != daddr) {
+ skb_dst_drop(skb);
+ }
+ return ret;
+}
+
+static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ enum ip_conntrack_info ctinfo __maybe_unused;
+ const struct nf_conn *ct __maybe_unused;
+ unsigned int ret;
+
+ ret = nf_nat_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip ||
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)
+ return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
+ ret : NF_DROP;
+ }
+#endif
+ return ret;
+}
+
+static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int ret;
+
+ ret = nf_nat_fn(ops, skb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ ct->tuplehash[!dir].tuple.src.u3.ip) {
+ if (ip_route_me_harder(skb, RTN_UNSPEC))
+ ret = NF_DROP;
+ }
+#ifdef CONFIG_XFRM
+ else if (ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all)
+ if (nf_xfrm_me_harder(skb, AF_INET))
+ ret = NF_DROP;
+#endif
+ }
+ return ret;
+}
+
+static const struct nf_chain_type nft_chain_nat_ipv4 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nf_nat_prerouting,
+ [NF_INET_POST_ROUTING] = nf_nat_postrouting,
+ [NF_INET_LOCAL_OUT] = nf_nat_output,
+ [NF_INET_LOCAL_IN] = nf_nat_fn,
+ },
+};
+
+static int __init nft_chain_nat_init(void)
+{
+ int err;
+
+ err = nft_register_chain_type(&nft_chain_nat_ipv4);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void __exit nft_chain_nat_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_nat_ipv4);
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
new file mode 100644
index 00000000000..125b66766c0
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int ret;
+ struct nft_pktinfo pkt;
+ u32 mark;
+ __be32 saddr, daddr;
+ u_int8_t tos;
+ const struct iphdr *iph;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = nft_do_chain(&pkt, ops);
+ if (ret != NF_DROP && ret != NF_QUEUE) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos)
+ if (ip_route_me_harder(skb, RTN_UNSPEC))
+ ret = NF_DROP;
+ }
+ return ret;
+}
+
+static const struct nf_chain_type nft_chain_route_ipv4 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook,
+ },
+};
+
+static int __init nft_chain_route_init(void)
+{
+ return nft_register_chain_type(&nft_chain_route_ipv4);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_route_ipv4);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 00000000000..e79718a382f
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/icmp.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/nft_reject.h>
+
+void nft_reject_ipv4_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach(pkt->skb, priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ break;
+ }
+
+ data[NFT_REG_VERDICT].verdict = NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval);
+
+static struct nft_expr_type nft_reject_ipv4_type;
+static const struct nft_expr_ops nft_reject_ipv4_ops = {
+ .type = &nft_reject_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv4_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+};
+
+static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "reject",
+ .ops = &nft_reject_ipv4_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv4_type);
+}
+
+static void __exit nft_reject_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv4_type);
+}
+
+module_init(nft_reject_ipv4_module_init);
+module_exit(nft_reject_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index d7d9882d4ca..044a0ddf6a7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -53,8 +53,12 @@
#include <net/transp_v6.h>
#endif
+struct ping_table {
+ struct hlist_nulls_head hash[PING_HTABLE_SIZE];
+ rwlock_t lock;
+};
-struct ping_table ping_table;
+static struct ping_table ping_table;
struct pingv6_ops pingv6_ops;
EXPORT_SYMBOL_GPL(pingv6_ops);
@@ -202,15 +206,14 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
#if IS_ENABLED(CONFIG_IPV6)
} else if (skb->protocol == htons(ETH_P_IPV6) &&
sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
(int) isk->inet_num,
- &inet6_sk(sk)->rcv_saddr,
+ &sk->sk_v6_rcv_saddr,
sk->sk_bound_dev_if);
- if (!ipv6_addr_any(&np->rcv_saddr) &&
- !ipv6_addr_equal(&np->rcv_saddr,
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
&ipv6_hdr(skb)->daddr))
continue;
#endif
@@ -233,15 +236,15 @@ exit:
static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
kgid_t *high)
{
- kgid_t *data = net->ipv4.sysctl_ping_group_range;
+ kgid_t *data = net->ipv4.ping_group_range.range;
unsigned int seq;
do {
- seq = read_seqbegin(&sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&sysctl_local_ports.lock, seq));
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
}
@@ -249,26 +252,33 @@ int ping_init_sock(struct sock *sk)
{
struct net *net = sock_net(sk);
kgid_t group = current_egid();
- struct group_info *group_info = get_current_groups();
- int i, j, count = group_info->ngroups;
+ struct group_info *group_info;
+ int i, j, count;
kgid_t low, high;
+ int ret = 0;
inet_get_ping_group_range_net(net, &low, &high);
if (gid_lte(low, group) && gid_lte(group, high))
return 0;
+ group_info = get_current_groups();
+ count = group_info->ngroups;
for (i = 0; i < group_info->nblocks; i++) {
int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
for (j = 0; j < cp_count; j++) {
kgid_t gid = group_info->blocks[i][j];
if (gid_lte(low, gid) && gid_lte(gid, high))
- return 0;
+ goto out_release_group;
}
count -= cp_count;
}
- return -EACCES;
+ ret = -EACCES;
+
+out_release_group:
+ put_group_info(group_info);
+ return ret;
}
EXPORT_SYMBOL_GPL(ping_init_sock);
@@ -317,6 +327,9 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
if (addr_len < sizeof(*addr))
return -EINVAL;
+ if (addr->sin6_family != AF_INET6)
+ return -EINVAL;
+
pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
@@ -362,7 +375,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
} else if (saddr->sa_family == AF_INET6) {
struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
struct ipv6_pinfo *np = inet6_sk(sk);
- np->rcv_saddr = np->saddr = addr->sin6_addr;
+ sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
#endif
}
}
@@ -376,7 +389,7 @@ static void ping_clear_saddr(struct sock *sk, int dif)
#if IS_ENABLED(CONFIG_IPV6)
} else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr));
+ memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
memset(&np->saddr, 0, sizeof(np->saddr));
#endif
}
@@ -416,10 +429,12 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
(int)sk->sk_bound_dev_if);
err = 0;
- if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) ||
- (sk->sk_family == AF_INET6 &&
- !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)))
+ if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
@@ -429,7 +444,7 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
- memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr));
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
#endif
sk_dst_reset(sk);
@@ -667,8 +682,8 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
}
EXPORT_SYMBOL_GPL(ping_common_sendmsg);
-int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len)
{
struct net *net = sock_net(sk);
struct flowi4 fl4;
@@ -695,7 +710,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
*/
if (msg->msg_name) {
- struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET)
@@ -713,11 +728,13 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.oif = sk->sk_bound_dev_if;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
sock_tx_timestamp(sk, &ipc.tx_flags);
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
if (err)
return err;
if (ipc.opt)
@@ -744,7 +761,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
return -EINVAL;
faddr = ipc.opt->opt.faddr;
}
- tos = RT_TOS(inet->tos);
+ tos = get_rttos(&ipc, inet);
if (sock_flag(sk, SOCK_LOCALROUTE) ||
(msg->msg_flags & MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->opt.is_strictroute)) {
@@ -769,7 +786,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -827,8 +844,6 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
{
struct inet_sock *isk = inet_sk(sk);
int family = sk->sk_family;
- struct sockaddr_in *sin;
- struct sockaddr_in6 *sin6;
struct sk_buff *skb;
int copied, err;
@@ -838,19 +853,13 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto out;
- if (addr_len) {
- if (family == AF_INET)
- *addr_len = sizeof(*sin);
- else if (family == AF_INET6 && addr_len)
- *addr_len = sizeof(*sin6);
- }
-
if (flags & MSG_ERRQUEUE) {
if (family == AF_INET) {
- return ip_recv_error(sk, msg, len);
+ return ip_recv_error(sk, msg, len, addr_len);
#if IS_ENABLED(CONFIG_IPV6)
} else if (family == AF_INET6) {
- return pingv6_ops.ipv6_recv_error(sk, msg, len);
+ return pingv6_ops.ipv6_recv_error(sk, msg, len,
+ addr_len);
#endif
}
}
@@ -874,11 +883,15 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Copy the address and add cmsg data. */
if (family == AF_INET) {
- sin = (struct sockaddr_in *) msg->msg_name;
- sin->sin_family = AF_INET;
- sin->sin_port = 0 /* skb->h.uh->source */;
- sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
if (isk->cmsg_flags)
ip_cmsg_recv(msg, skb);
@@ -887,20 +900,28 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
} else if (family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *ip6 = ipv6_hdr(skb);
- sin6 = (struct sockaddr_in6 *) msg->msg_name;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = 0;
- sin6->sin6_addr = ip6->saddr;
-
- sin6->sin6_flowinfo = 0;
- if (np->sndflow)
- sin6->sin6_flowinfo = ip6_flowinfo(ip6);
-
- sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
- IP6CB(skb)->iif);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ if (sin6) {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+ sin6->sin6_flowinfo = 0;
+ if (np->sndflow)
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ IP6CB(skb)->iif);
+ *addr_len = sizeof(*sin6);
+ }
if (inet6_sk(sk)->rxopt.all)
- pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb);
+ pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
#endif
} else {
BUG();
@@ -1073,7 +1094,7 @@ void ping_seq_stop(struct seq_file *seq, void *v)
EXPORT_SYMBOL_GPL(ping_seq_stop);
static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
- int bucket, int *len)
+ int bucket)
{
struct inet_sock *inet = inet_sk(sp);
__be32 dest = inet->inet_daddr;
@@ -1082,7 +1103,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
@@ -1090,23 +1111,22 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
0, sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops), len);
+ atomic_read(&sp->sk_drops));
}
static int ping_v4_seq_show(struct seq_file *seq, void *v)
{
+ seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-127s\n",
- " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
struct ping_iter_state *state = seq->private;
- int len;
- ping_v4_format_sock(v, seq, state->bucket, &len);
- seq_printf(seq, "%*s\n", 127 - len, "");
+ ping_v4_format_sock(v, seq, state->bucket);
}
+ seq_pad(seq, '\n');
return 0;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4a0335854b8..ae0af9386f7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,12 +273,19 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+ SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+ SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+ SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+ SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
SNMP_MIB_SENTINEL
};
@@ -332,22 +339,22 @@ static void icmp_put(struct seq_file *seq)
atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
seq_printf(seq, " OutMsgs OutErrors");
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu %lu",
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + icmpmibmap[i].index));
seq_printf(seq, " %lu %lu",
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}
@@ -372,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
seq_printf(seq, " %llu",
- snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipstats_list[i].entry,
offsetof(struct ipstats_mib, syncp)));
@@ -388,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld",
- snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+ snmp_fold_field(net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
else
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+ snmp_fold_field(net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
}
@@ -403,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdp:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.udp_statistics,
+ snmp_fold_field(net->mib.udp_statistics,
snmp4_udp_list[i].entry));
/* the UDP and UDP-Lite MIBs are the same */
@@ -414,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdpLite:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
+ snmp_fold_field(net->mib.udplite_statistics,
snmp4_udp_list[i].entry));
seq_putc(seq, '\n');
@@ -451,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nTcpExt:");
for (i = 0; snmp4_net_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.net_statistics,
+ snmp_fold_field(net->mib.net_statistics,
snmp4_net_list[i].entry));
seq_puts(seq, "\nIpExt:");
@@ -461,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nIpExt:");
for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
seq_printf(seq, " %llu",
- snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipextstats_list[i].entry,
offsetof(struct ipstats_mib, syncp)));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index ce848461acb..46d6a1c923a 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -31,10 +31,6 @@
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
-/*
- * Add a protocol handler to the hash tables
- */
-
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
if (!prot->netns_ok) {
@@ -55,10 +51,6 @@ int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
}
EXPORT_SYMBOL(inet_add_offload);
-/*
- * Remove a protocol from the hash tables.
- */
-
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
int ret;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a86c7ae7188..2c65160565e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -218,8 +218,10 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_sk_update_pmtu(skb, sk, info);
- else if (type == ICMP_REDIRECT)
+ else if (type == ICMP_REDIRECT) {
ipv4_sk_redirect(skb, sk);
+ return;
+ }
/* Report error on raw socket, if:
1. User requested ip_recverr.
@@ -297,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Charge it to the socket. */
- ipv4_pktinfo_prepare(skb);
+ ipv4_pktinfo_prepare(sk, skb);
if (sock_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
return NET_RX_DROP;
@@ -387,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(iph, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
@@ -491,7 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
*/
if (msg->msg_namelen) {
- struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
err = -EINVAL;
if (msg->msg_namelen < sizeof(*usin))
goto out;
@@ -517,10 +519,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
if (err)
goto out;
if (ipc.opt)
@@ -556,7 +560,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
daddr = ipc.opt->opt.faddr;
}
}
- tos = RT_CONN_FLAGS(sk);
+ tos = get_rtconn_flags(&ipc, sk);
if (msg->msg_flags & MSG_DONTROUTE)
tos |= RTO_ONLINK;
@@ -571,7 +575,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
- inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP |
+ inet_sk_flowi_flags(sk) |
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
@@ -686,17 +690,14 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
int err = -EOPNOTSUPP;
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
if (flags & MSG_OOB)
goto out;
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE) {
- err = ip_recv_error(sk, msg, len);
+ err = ip_recv_error(sk, msg, len, addr_len);
goto out;
}
@@ -722,6 +723,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 727f4365bcd..190199851c9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -89,6 +89,7 @@
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
@@ -112,9 +113,6 @@
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
-/* IPv4 datagram length is stored into 16bit field (tot_len) */
-#define IP_MAX_MTU 0xFFFF
-
#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
@@ -142,11 +140,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);
-static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int how)
-{
-}
-
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
WARN_ON(1);
@@ -165,7 +158,6 @@ static struct dst_ops ipv4_dst_ops = {
.mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
- .ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
@@ -197,7 +189,7 @@ const __u8 ip_tos2prio[16] = {
EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -295,7 +287,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
dst_entries_get_slow(&ipv4_dst_ops),
- st->in_hit,
+ 0, /* st->in_hit */
st->in_slow_tot,
st->in_slow_mc,
st->in_no_route,
@@ -303,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
st->in_martian_dst,
st->in_martian_src,
- st->out_hit,
+ 0, /* st->out_hit */
st->out_slow_tot,
st->out_slow_mc,
- st->gc_total,
- st->gc_ignored,
- st->gc_goal_miss,
- st->gc_dst_overflow,
- st->in_hlist_search,
- st->out_hlist_search
+ 0, /* st->gc_total */
+ 0, /* st->gc_ignored */
+ 0, /* st->gc_goal_miss */
+ 0, /* st->gc_dst_overflow */
+ 0, /* st->in_hlist_search */
+ 0 /* st->out_hlist_search */
);
return 0;
}
@@ -465,39 +457,45 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}
-/*
- * Peer allocation may fail only in serious out-of-memory conditions. However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+ atomic_t id;
+ u32 stamp32;
+};
+
+static struct ip_ident_bucket *ip_idents __read_mostly;
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes hard for an attacker
+ * to infer how many packets were sent between two points in time.
*/
-static void ip_select_fb_ident(struct iphdr *iph)
+u32 ip_idents_reserve(u32 hash, int segs)
{
- static DEFINE_SPINLOCK(ip_fb_id_lock);
- static u32 ip_fallback_id;
- u32 salt;
+ struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 now = (u32)jiffies;
+ u32 delta = 0;
- spin_lock_bh(&ip_fb_id_lock);
- salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
- iph->id = htons(salt & 0xFFFF);
- ip_fallback_id = salt;
- spin_unlock_bh(&ip_fb_id_lock);
+ if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ delta = prandom_u32_max(now - old);
+
+ return atomic_add_return(segs + delta, &bucket->id) - segs;
}
+EXPORT_SYMBOL(ip_idents_reserve);
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+void __ip_select_ident(struct iphdr *iph, int segs)
{
- struct net *net = dev_net(dst->dev);
- struct inet_peer *peer;
+ static u32 ip_idents_hashrnd __read_mostly;
+ u32 hash, id;
- peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
- if (peer) {
- iph->id = htons(inet_getid(peer, more));
- inet_putpeer(peer);
- return;
- }
+ net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
- ip_select_fb_ident(iph);
+ hash = jhash_3words((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+ iph->protocol,
+ ip_idents_hashrnd);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
@@ -700,7 +698,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
out_unlock:
spin_unlock_bh(&fnhe_lock);
- return;
}
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
@@ -1003,6 +1000,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
struct flowi4 fl4;
struct rtable *rt;
+ if (!mark)
+ mark = IP4_REPLY_MARK(net, skb->mark);
+
__build_flow_key(&fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
@@ -1020,6 +1020,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!fl4.flowi4_mark)
+ fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
rt = __ip_route_output_key(sock_net(sk), &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1032,20 +1036,25 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
- struct dst_entry *dst;
+ struct dst_entry *odst = NULL;
bool new = false;
bh_lock_sock(sk);
- rt = (struct rtable *) __sk_dst_get(sk);
- if (sock_owned_by_user(sk) || !rt) {
+ if (!ip_sk_accept_pmtu(sk))
+ goto out;
+
+ odst = sk_dst_get(sk);
+
+ if (sock_owned_by_user(sk) || !odst) {
__ipv4_sk_update_pmtu(skb, sk, mtu);
goto out;
}
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- if (!__sk_dst_check(sk, 0)) {
+ rt = (struct rtable *)odst;
+ if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
if (IS_ERR(rt))
goto out;
@@ -1055,8 +1064,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
- dst = dst_check(&rt->dst, 0);
- if (!dst) {
+ if (!dst_check(&rt->dst, 0)) {
if (new)
dst_release(&rt->dst);
@@ -1068,10 +1076,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
}
if (new)
- __sk_dst_set(sk, &rt->dst);
+ sk_dst_set(sk, &rt->dst);
out:
bh_unlock_sock(sk);
+ dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
@@ -1135,7 +1144,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sk_buff *skb)
+static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
pr_debug("%s: %pI4 -> %pI4, %s\n",
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1525,7 +1534,7 @@ static int __mkroute_input(struct sk_buff *skb,
struct in_device *out_dev;
unsigned int flags = 0;
bool do_cache;
- u32 itag;
+ u32 itag = 0;
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
@@ -1596,6 +1605,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
@@ -1694,25 +1704,27 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.daddr = daddr;
fl4.saddr = saddr;
err = fib_lookup(net, &fl4, &res);
- if (err != 0)
+ if (err != 0) {
+ if (!IN_DEV_FORWARD(in_dev))
+ err = -EHOSTUNREACH;
goto no_route;
-
- RT_CACHE_STAT_INC(in_slow_tot);
+ }
if (res.type == RTN_BROADCAST)
goto brd_input;
if (res.type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
- LOOPBACK_IFINDEX,
- dev, in_dev, &itag);
+ 0, dev, in_dev, &itag);
if (err < 0)
goto martian_source_keep_err;
goto local_input;
}
- if (!IN_DEV_FORWARD(in_dev))
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
goto no_route;
+ }
if (res.type != RTN_UNICAST)
goto martian_destination;
@@ -1767,13 +1779,18 @@ local_input:
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
- if (do_cache)
- rt_cache_route(&FIB_RES_NH(res), rth);
+ if (do_cache) {
+ if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ rth->dst.flags |= DST_NOCACHE;
+ rt_add_uncached_list(rth);
+ }
+ }
skb_dst_set(skb, &rth->dst);
err = 0;
goto out;
@@ -2072,7 +2089,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
RT_SCOPE_LINK);
goto make_route;
}
- if (fl4->saddr) {
+ if (!fl4->saddr) {
if (ipv4_is_multicast(fl4->daddr))
fl4->saddr = inet_select_addr(dev_out, 0,
fl4->flowi4_scope);
@@ -2215,7 +2232,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard;
+ new->output = dst_discard_sk;
new->dev = ort->dst.dev;
if (new->dev)
@@ -2354,7 +2371,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
}
} else
#endif
- if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+ if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
goto nla_put_failure;
}
@@ -2465,11 +2482,6 @@ errout_free:
goto errout;
}
-int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return skb->len;
-}
-
void ip_rt_multicast_event(struct in_device *in_dev)
{
rt_cache_flush(dev_net(in_dev->dev));
@@ -2707,6 +2719,12 @@ int __init ip_rt_init(void)
{
int rc = 0;
+ ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+ if (!ip_idents)
+ panic("IP: failed to allocate ip_idents\n");
+
+ prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 14a15c49129..c86624b36a6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -25,15 +25,7 @@
extern int sysctl_tcp_syncookies;
-__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
-EXPORT_SYMBOL(syncookie_secret);
-
-static __init int init_syncookies(void)
-{
- get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
- return 0;
-}
-__initcall(init_syncookies);
+static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -44,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
+ __u32 *tmp;
+
+ net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+ tmp = __get_cpu_var(ipv4_cookie_scratch);
memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
tmp[0] = (__force u32)saddr;
tmp[1] = (__force u32)daddr;
@@ -89,8 +84,7 @@ __u32 cookie_init_timestamp(struct request_sock *req)
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
- __be16 dport, __u32 sseq, __u32 count,
- __u32 data)
+ __be16 dport, __u32 sseq, __u32 data)
{
/*
* Compute the secure sequence number.
@@ -102,7 +96,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* As an extra hack, we add a small "data" value that encodes the
* MSS into the second hash value.
*/
-
+ u32 count = tcp_cookie_time();
return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
sseq + (count << COOKIEBITS) +
((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
@@ -114,22 +108,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* If the syncookie is bad, the data returned will be out of
* range. This must be checked by the caller.
*
- * The count value used to generate the cookie must be within
- * "maxdiff" if the current (passed-in) "count". The return value
- * is (__u32)-1 if this test fails.
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value (__u32)-1 if this test fails.
*/
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport, __u32 sseq,
- __u32 count, __u32 maxdiff)
+ __be16 sport, __be16 dport, __u32 sseq)
{
- __u32 diff;
+ u32 diff, count = tcp_cookie_time();
/* Strip away the layers from the cookie */
cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
- diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
- if (diff >= maxdiff)
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
return (__u32)-1;
return (cookie -
@@ -138,22 +131,22 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
}
/*
- * MSS Values are taken from the 2009 paper
- * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
- * - values 1440 to 1460 accounted for 80% of observed mss values
- * - values outside the 536-1460 range are rare (<0.2%).
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ * .. lower than 536 are rare (< 0.2%)
+ * .. between 537 and 1299 account for less than < 1.5% of observed values
+ * .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ * .. exceeding 1460 are very rare (< 0.04%)
*
- * Table must be sorted.
+ * 1460 is the single most frequently announced mss value (30 to 46% depending
+ * on monitor location). Table must be sorted.
*/
static __u16 const msstab[] = {
- 64,
- 512,
536,
- 1024,
- 1440,
+ 1300,
+ 1440, /* 1440, 1452: PPPoE */
1460,
- 4312,
- 8960,
};
/*
@@ -173,7 +166,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
th->source, th->dest, ntohl(th->seq),
- jiffies / (HZ * 60), mssind);
+ mssind);
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
@@ -189,13 +182,6 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
}
/*
- * This (misnamed) value is the age of syncookie which is permitted.
- * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
- * sysctl_tcp_retries1. It's a rather complicated formula (exponential
- * backoff) to compute at runtime so it's currently hardcoded here.
- */
-#define COUNTER_TRIES 4
-/*
* Check if a ack sequence number is a valid syncookie.
* Return the decoded mss if it is, or 0 if not.
*/
@@ -204,9 +190,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
{
__u32 seq = ntohl(th->seq) - 1;
__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
- th->source, th->dest, seq,
- jiffies / (HZ * 60),
- COUNTER_TRIES);
+ th->source, th->dest, seq);
return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
@@ -315,10 +299,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
req->mss = mss;
- ireq->loc_port = th->dest;
- ireq->rmt_port = th->source;
- ireq->loc_addr = ip_hdr(skb)->daddr;
- ireq->rmt_addr = ip_hdr(skb)->saddr;
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_rmt_port = th->source;
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ ireq->ir_mark = inet_request_mark(sk, skb);
ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
@@ -355,11 +340,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
- (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
- ireq->loc_addr, th->source, th->dest);
+ (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, th->source, th->dest);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 540279f4c53..79a007c5255 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -43,12 +43,12 @@ static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
-static void set_local_port_range(int range[2])
+static void set_local_port_range(struct net *net, int range[2])
{
- write_seqlock(&sysctl_local_ports.lock);
- sysctl_local_ports.range[0] = range[0];
- sysctl_local_ports.range[1] = range[1];
- write_sequnlock(&sysctl_local_ports.lock);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = range[0];
+ net->ipv4.ip_local_ports.range[1] = range[1];
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -56,6 +56,8 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ip_local_ports.range);
int ret;
int range[2];
struct ctl_table tmp = {
@@ -66,14 +68,15 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
.extra2 = &ip_local_port_range_max,
};
- inet_get_local_port_range(range, range + 1);
+ inet_get_local_port_range(net, &range[0], &range[1]);
+
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
if (range[1] < range[0])
ret = -EINVAL;
else
- set_local_port_range(range);
+ set_local_port_range(net, range);
}
return ret;
@@ -83,23 +86,27 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
{
kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
unsigned int seq;
do {
- seq = read_seqbegin(&sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&sysctl_local_ports.lock, seq));
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
/* Update system visible IP port range */
static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
{
kgid_t *data = table->data;
- write_seqlock(&sysctl_local_ports.lock);
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
data[0] = low;
data[1] = high;
- write_sequnlock(&sysctl_local_ports.lock);
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -193,49 +200,6 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
-static int ipv4_tcp_mem(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
- unsigned long vec[3];
- struct net *net = current->nsproxy->net_ns;
-#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg;
-#endif
-
- struct ctl_table tmp = {
- .data = &vec,
- .maxlen = sizeof(vec),
- .mode = ctl->mode,
- };
-
- if (!write) {
- ctl->data = &net->ipv4.sysctl_tcp_mem;
- return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
- }
-
- ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (ret)
- return ret;
-
-#ifdef CONFIG_MEMCG_KMEM
- rcu_read_lock();
- memcg = mem_cgroup_from_task(current);
-
- tcp_prot_mem(memcg, vec[0], 0);
- tcp_prot_mem(memcg, vec[1], 1);
- tcp_prot_mem(memcg, vec[2], 2);
- rcu_read_unlock();
-#endif
-
- net->ipv4.sysctl_tcp_mem[0] = vec[0];
- net->ipv4.sysctl_tcp_mem[1] = vec[1];
- net->ipv4.sysctl_tcp_mem[2] = vec[2];
-
- return 0;
-}
-
static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -267,6 +231,11 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
ret = -EINVAL;
goto bad_key;
}
+ /* Generate a dummy secret but don't publish it. This
+ * is needed so we don't regenerate a new key on the
+ * first invocation of tcp_fastopen_cookie_gen
+ */
+ tcp_fastopen_init_key_once(false);
tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
}
@@ -317,13 +286,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &ip_ttl_max,
},
{
- .procname = "ip_no_pmtu_disc",
- .data = &ipv4_config.no_pmtu_disc,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "ip_nonlocal_bind",
.data = &sysctl_ip_nonlocal_bind,
.maxlen = sizeof(int),
@@ -475,20 +437,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_local_port_range",
- .data = &sysctl_local_ports.range,
- .maxlen = sizeof(sysctl_local_ports.range),
- .mode = 0644,
- .proc_handler = ipv4_local_port_range,
- },
- {
- .procname = "ip_local_reserved_ports",
- .data = NULL, /* initialized in sysctl_ipv4_init */
- .maxlen = 65536,
- .mode = 0644,
- .proc_handler = proc_do_large_bitmap,
- },
- {
.procname = "igmp_max_memberships",
.data = &sysctl_igmp_max_memberships,
.maxlen = sizeof(int),
@@ -552,6 +500,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_mem",
+ .maxlen = sizeof(sysctl_tcp_mem),
+ .data = &sysctl_tcp_mem,
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
.procname = "tcp_wmem",
.data = &sysctl_tcp_wmem,
.maxlen = sizeof(sysctl_tcp_wmem),
@@ -732,20 +687,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_allowed_congestion_control,
},
{
- .procname = "tcp_max_ssthresh",
- .data = &sysctl_tcp_max_ssthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
- {
+ {
.procname = "tcp_thin_dupack",
.data = &sysctl_tcp_thin_dupack,
.maxlen = sizeof(int),
@@ -771,6 +719,15 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &gso_max_segs,
},
{
+ .procname = "tcp_autocorking",
+ .data = &sysctl_tcp_autocorking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -841,7 +798,7 @@ static struct ctl_table ipv4_net_table[] = {
},
{
.procname = "ping_group_range",
- .data = &init_net.ipv4.sysctl_ping_group_range,
+ .data = &init_net.ipv4.ping_group_range.range,
.maxlen = sizeof(gid_t)*2,
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
@@ -854,10 +811,46 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_mem",
- .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
+ .procname = "ip_local_port_range",
+ .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
+ .data = &init_net.ipv4.ip_local_ports.range,
+ .mode = 0644,
+ .proc_handler = ipv4_local_port_range,
+ },
+ {
+ .procname = "ip_local_reserved_ports",
+ .data = &init_net.ipv4.sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "ip_no_pmtu_disc",
+ .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_forward_use_pmtu",
+ .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = ipv4_tcp_mem,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fwmark_accept",
+ .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
{ }
};
@@ -868,47 +861,29 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table = ipv4_net_table;
if (!net_eq(net, &init_net)) {
+ int i;
+
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
- table[0].data =
- &net->ipv4.sysctl_icmp_echo_ignore_all;
- table[1].data =
- &net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
- table[2].data =
- &net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
- table[3].data =
- &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
- table[4].data =
- &net->ipv4.sysctl_icmp_ratelimit;
- table[5].data =
- &net->ipv4.sysctl_icmp_ratemask;
- table[6].data =
- &net->ipv4.sysctl_ping_group_range;
- table[7].data =
- &net->ipv4.sysctl_tcp_ecn;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
}
- /*
- * Sane defaults - nobody may create ping sockets.
- * Boot scripts should set this to distro-specific group.
- */
- net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
- net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
-
- tcp_init_mem(net);
-
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
goto err_reg;
+ net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!net->ipv4.sysctl_local_reserved_ports)
+ goto err_ports;
+
return 0;
+err_ports:
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
err_reg:
if (!net_eq(net, &init_net))
kfree(table);
@@ -920,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
struct ctl_table *table;
+ kfree(net->ipv4.sysctl_local_reserved_ports);
table = net->ipv4.ipv4_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
kfree(table);
@@ -933,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
static __init int sysctl_ipv4_init(void)
{
struct ctl_table_header *hdr;
- struct ctl_table *i;
-
- for (i = ipv4_table; i->procname; i++) {
- if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
- i->data = sysctl_local_reserved_ports;
- break;
- }
- }
- if (!i->procname)
- return -EINVAL;
hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
if (hdr == NULL)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6e5617b9f9d..9d2118e5fbc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -285,12 +285,16 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
int sysctl_tcp_min_tso_segs __read_mostly = 2;
+int sysctl_tcp_autocorking __read_mostly = 1;
+
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
+long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);
@@ -377,13 +381,13 @@ void tcp_init_sock(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- skb_queue_head_init(&tp->out_of_order_queue);
+ __skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
- tp->mdev = TCP_TIMEOUT_INIT;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -617,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
tp->snd_up = tp->write_seq;
}
-static inline void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle)
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+ int size_goal)
{
- if (tcp_send_head(sk)) {
- struct tcp_sock *tp = tcp_sk(sk);
+ return skb->len < size_goal &&
+ sysctl_tcp_autocorking &&
+ skb != tcp_write_queue_head(sk) &&
+ atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+}
+
+static void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (!tcp_send_head(sk))
+ return;
+
+ skb = tcp_write_queue_tail(sk);
+ if (!(flags & MSG_MORE) || forced_push(tp))
+ tcp_mark_push(tp, skb);
- if (!(flags & MSG_MORE) || forced_push(tp))
- tcp_mark_push(tp, tcp_write_queue_tail(sk));
+ tcp_mark_urg(tp, flags);
- tcp_mark_urg(tp, flags);
- __tcp_push_pending_frames(sk, mss_now,
- (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+ if (tcp_should_autocork(sk, skb, size_goal)) {
+
+ /* avoid atomic op if TSQ_THROTTLED bit is already set */
+ if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+ return;
}
+
+ if (flags & MSG_MORE)
+ nonagle = TCP_NAGLE_CORK;
+
+ __tcp_push_pending_frames(sk, mss_now, nonagle);
}
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
@@ -806,12 +849,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
xmit_size_goal = min_t(u32, gso_size,
sk->sk_gso_max_size - 1 - hlen);
- /* TSQ : try to have at least two segments in flight
- * (one in NIC TX ring, another in Qdisc)
- */
- xmit_size_goal = min_t(u32, xmit_size_goal,
- sysctl_tcp_limit_output_bytes >> 1);
-
xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
/* We try hard to avoid divides here */
@@ -938,7 +975,8 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -948,7 +986,7 @@ wait_for_memory:
out:
if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
- tcp_push(sk, flags, mss_now, tp->nonagle);
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
return copied;
do_error:
@@ -1006,7 +1044,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp)
}
}
-static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
+ int *copied, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
int err, flags;
@@ -1021,11 +1060,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
if (unlikely(tp->fastopen_req == NULL))
return -ENOBUFS;
tp->fastopen_req->data = msg;
+ tp->fastopen_req->size = size;
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
msg->msg_namelen, flags);
- *size = tp->fastopen_req->copied;
+ *copied = tp->fastopen_req->copied;
tcp_free_fastopen_req(tp);
return err;
}
@@ -1045,7 +1085,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
flags = msg->msg_flags;
if (flags & MSG_FASTOPEN) {
- err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
@@ -1068,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
- goto out;
+ goto out_nopush;
}
err = -EINVAL;
@@ -1229,7 +1269,8 @@ wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -1240,7 +1281,8 @@ wait_for_memory:
out:
if (copied)
- tcp_push(sk, flags, mss_now, tp->nonagle);
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+out_nopush:
release_sock(sk);
return copied + copied_syn;
@@ -1429,7 +1471,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
do {
if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
last_issued, &done,
- &used) == DMA_SUCCESS) {
+ &used) == DMA_COMPLETE) {
/* Safe to free early-copied skbs now */
__skb_queue_purge(&sk->sk_async_wait_queue);
break;
@@ -1437,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
struct sk_buff *skb;
while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
(dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_SUCCESS)) {
+ used) == DMA_COMPLETE)) {
__skb_dequeue(&sk->sk_async_wait_queue);
kfree_skb(skb);
}
@@ -1627,11 +1669,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
net_dma_find_channel()) {
- preempt_enable_no_resched();
+ preempt_enable();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
} else {
- preempt_enable_no_resched();
+ preempt_enable();
}
}
#endif
@@ -2190,7 +2232,7 @@ adjudge_to_death:
/* This is a (useful) BSD violating of the RFC. There is a
* problem with TCP as specified in that the other end could
* keep a socket open forever with no application left this end.
- * We use a 3 minute timeout (about the same as BSD) then kill
+ * We use a 1 minute timeout (about the same as BSD) then kill
* our end. If they send after that then tough - BUT: long enough
* that we won't make the old 4*rto = almost no time - whoops
* reset mistake.
@@ -2300,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
- tp->srtt = 0;
+ tp->srtt_us = 0;
if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
@@ -2744,8 +2786,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
- info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
- info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+ info->tcpi_rtt = tp->srtt_us >> 3;
+ info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
@@ -2755,6 +2797,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
+
+ info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
+ sk->sk_pacing_rate : ~0ULL;
+ info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
+ sk->sk_max_pacing_rate : ~0ULL;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2870,6 +2917,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
+
+ case TCP_FASTOPEN:
+ if (icsk->icsk_accept_queue.fastopenq != NULL)
+ val = icsk->icsk_accept_queue.fastopenq->max_qlen;
+ else
+ val = 0;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
@@ -3097,13 +3152,13 @@ static int __init set_thash_entries(char *str)
}
__setup("thash_entries=", set_thash_entries);
-void tcp_init_mem(struct net *net)
+static void tcp_init_mem(void)
{
unsigned long limit = nr_free_buffer_pages() / 8;
limit = max(limit, 128UL);
- net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
- net->ipv4.sysctl_tcp_mem[1] = limit;
- net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+ sysctl_tcp_mem[0] = limit / 4 * 3;
+ sysctl_tcp_mem[1] = limit;
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
}
void __init tcp_init(void)
@@ -3137,10 +3192,9 @@ void __init tcp_init(void)
&tcp_hashinfo.ehash_mask,
0,
thash_entries ? 0 : 512 * 1024);
- for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
- INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
- }
+
if (inet_ehash_locks_alloc(&tcp_hashinfo))
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
@@ -3166,7 +3220,7 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans = cnt / 2;
sysctl_max_syn_backlog = max(128, cnt / 256);
- tcp_init_mem(&init_net);
+ tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f45e1c24244..d5de69bc04f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,16 +140,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 019c2389a34..7b09d8b49fa 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -15,8 +15,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_max_ssthresh = 0;
-
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
@@ -278,56 +276,24 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
-/* RFC2861 Check whether we are limited by application or congestion window
- * This is the inverse of cwnd check in tcp_tso_should_defer
- */
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- u32 left;
-
- if (in_flight >= tp->snd_cwnd)
- return true;
-
- left = tp->snd_cwnd - in_flight;
- if (sk_can_gso(sk) &&
- left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
- left * tp->mss_cache < sk->sk_gso_max_size &&
- left < sk->sk_gso_max_segs)
- return true;
- return left <= tcp_max_tso_deferred_mss(tp);
-}
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-
-/*
- * Slow start is used when congestion window is less than slow start
- * threshold. This version implements the basic RFC2581 version
- * and optionally supports:
- * RFC3742 Limited Slow Start - growth limited to max_ssthresh
- * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- int cnt; /* increase in packets */
- unsigned int delta = 0;
- u32 snd_cwnd = tp->snd_cwnd;
-
- if (unlikely(!snd_cwnd)) {
- pr_err_once("snd_cwnd is nul, please report this bug.\n");
- snd_cwnd = 1U;
- }
-
- if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
- cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
- else
- cnt = snd_cwnd; /* exponential increase */
+ u32 cwnd = tp->snd_cwnd + acked;
- tp->snd_cwnd_cnt += cnt;
- while (tp->snd_cwnd_cnt >= snd_cwnd) {
- tp->snd_cwnd_cnt -= snd_cwnd;
- delta++;
- }
- tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
@@ -351,16 +317,16 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
/* In dangerous area, increase slowly. */
else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
@@ -375,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-/* Lower bound on congestion window with halving. */
-u32 tcp_reno_min_cwnd(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- return tp->snd_ssthresh/2;
-}
-EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
-
struct tcp_congestion_ops tcp_reno = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
};
/* Initial congestion control used (until SYN)
@@ -401,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = {
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index b6ae92a51f5..a9bd8a4828a 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,18 +304,18 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
@@ -408,7 +408,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ratio += cnt;
- ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT);
+ ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
}
/* Some calls are for duplicates without timetamps */
@@ -475,10 +475,6 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
- /* hystart needs ms clock resolution */
- if (hystart && HZ < 1000)
- cubictcp.flags |= TCP_CONG_RTT_STAMP;
-
return tcp_register_congestion_control(&cubictcp);
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ab7bd35bb31..9771563ab56 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -8,12 +8,26 @@
#include <net/inetpeer.h>
#include <net/tcp.h>
-int sysctl_tcp_fastopen __read_mostly;
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+void tcp_fastopen_init_key_once(bool publish)
+{
+ static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ /* tcp_fastopen_reset_cipher publishes the new context
+ * atomically, so we allow this race happening here.
+ *
+ * All call sites of tcp_fastopen_cookie_gen also check
+ * for a valid cookie, so this is an acceptable risk.
+ */
+ if (net_get_random_once(key, sizeof(key)) && publish)
+ tcp_fastopen_reset_cipher(key, sizeof(key));
+}
+
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
struct tcp_fastopen_context *ctx =
@@ -58,34 +72,224 @@ error: kfree(ctx);
return err;
}
-/* Computes the fastopen cookie for the IP path.
- * The path is a 128 bits long (pad with zeros for IPv4).
- *
- * The caller must check foc->len to determine if a valid cookie
- * has been generated successfully.
-*/
-void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
- struct tcp_fastopen_cookie *foc)
+static bool __tcp_fastopen_cookie_gen(const void *path,
+ struct tcp_fastopen_cookie *foc)
{
- __be32 path[4] = { src, dst, 0, 0 };
struct tcp_fastopen_context *ctx;
+ bool ok = false;
+
+ tcp_fastopen_init_key_once(true);
rcu_read_lock();
ctx = rcu_dereference(tcp_fastopen_ctx);
if (ctx) {
- crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path);
+ crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ ok = true;
}
rcu_read_unlock();
+ return ok;
+}
+
+/* Generate the fastopen cookie by doing aes128 encryption on both
+ * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
+ * addresses. For the longer IPv6 addresses use CBC-MAC.
+ *
+ * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+ */
+static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ if (req->rsk_ops->family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(syn);
+
+ __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
+ return __tcp_fastopen_cookie_gen(path, foc);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (req->rsk_ops->family == AF_INET6) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+ struct tcp_fastopen_cookie tmp;
+
+ if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ int i = 4;
+
+ for (i = 0; i < 4; i++)
+ buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
+ return __tcp_fastopen_cookie_gen(buf, foc);
+ }
+ }
+#endif
+ return false;
+}
+
+static bool tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ struct sock *child;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ return false;
+
+ spin_lock(&queue->fastopenq->lock);
+ queue->fastopenq->qlen++;
+ spin_unlock(&queue->fastopenq->lock);
+
+ /* Initialize the child socket. Have to fix some values to take
+ * into account the child is a Fast Open socket and is created
+ * only out of the bits carried in the SYN packet.
+ */
+ tp = tcp_sk(child);
+
+ tp->fastopen_rsk = req;
+ /* Do a hold on the listner sk so that if the listener is being
+ * closed, the child that has been accepted can live on and still
+ * access listen_lock.
+ */
+ sock_hold(sk);
+ tcp_rsk(req)->listener = sk;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never
+ * scaled. So correct it appropriately.
+ */
+ tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+ /* Activate the retrans timer so that SYNACK can be retransmitted.
+ * The request socket is not added to the SYN table of the parent
+ * because it's been added to the accept queue directly.
+ */
+ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+ /* Now finish processing the fastopen child socket. */
+ inet_csk(child)->icsk_af_ops->rebuild_header(child);
+ tcp_init_congestion_control(child);
+ tcp_mtup_init(child);
+ tcp_init_metrics(child);
+ tcp_init_buffer_space(child);
+
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ *
+ * XXX (TFO) - we honor a zero-payload TFO request for now,
+ * (any reason not to?) but no need to queue the skb since
+ * there is no data. How about SYN+FIN?
+ */
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) {
+ skb = skb_get(skb);
+ skb_dst_drop(skb);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+ skb_set_owner_r(skb, child);
+ __skb_queue_tail(&child->sk_receive_queue, skb);
+ tp->syn_data_acked = 1;
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(child);
+ sock_put(child);
+ WARN_ON(req->sk == NULL);
+ return true;
}
+EXPORT_SYMBOL(tcp_fastopen_create_child);
-static int __init tcp_fastopen_init(void)
+static bool tcp_fastopen_queue_check(struct sock *sk)
{
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ struct fastopen_queue *fastopenq;
- get_random_bytes(key, sizeof(key));
- tcp_fastopen_reset_cipher(key, sizeof(key));
- return 0;
+ /* Make sure the listener has enabled fastopen, and we don't
+ * exceed the max # of pending TFO requests allowed before trying
+ * to validating the cookie in order to avoid burning CPU cycles
+ * unnecessarily.
+ *
+ * XXX (TFO) - The implication of checking the max_qlen before
+ * processing a cookie request is that clients can't differentiate
+ * between qlen overflow causing Fast Open to be disabled
+ * temporarily vs a server not supporting Fast Open at all.
+ */
+ fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq == NULL || fastopenq->max_qlen == 0)
+ return false;
+
+ if (fastopenq->qlen >= fastopenq->max_qlen) {
+ struct request_sock *req1;
+ spin_lock(&fastopenq->lock);
+ req1 = fastopenq->rskq_rst_head;
+ if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+ spin_unlock(&fastopenq->lock);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ return false;
+ }
+ fastopenq->rskq_rst_head = req1->dl_next;
+ fastopenq->qlen--;
+ spin_unlock(&fastopenq->lock);
+ reqsk_free(req1);
+ }
+ return true;
}
-late_initcall(tcp_fastopen_init);
+/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
+ * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
+ * cookie request (foc->len == 0).
+ */
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
+{
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+ bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+
+ if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ (syn_data || foc->len >= 0) &&
+ tcp_fastopen_queue_check(sk))) {
+ foc->len = -1;
+ return false;
+ }
+
+ if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ goto fastopen;
+
+ if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
+ foc->len == valid_foc.len &&
+ !memcmp(foc->val, valid_foc.val, foc->len)) {
+ /* Cookie is valid. Create a (full) child socket to accept
+ * the data in SYN before returning a SYN-ACK to ack the
+ * data. If we fail to create the socket, fall back and
+ * ack the ISN only but includes the same cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to send
+ * data in SYN_RECV state.
+ */
+fastopen:
+ if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ foc->len = -1;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return true;
+ }
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), foc->len ?
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
+ LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+ *foc = valid_foc;
+ return false;
+}
+EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 30f27f6b365..1c4908280d9 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,16 +109,16 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
*
@@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
.cong_avoid = hstcp_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.owner = THIS_MODULE,
.name = "highspeed"
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index c1a8175361e..031361311a8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,16 +227,16 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 57bdd17dff4..d8f8f05a495 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -21,7 +21,7 @@ struct hybla {
u32 rho2; /* Rho * Rho, integer part */
u32 rho_3ls; /* Rho parameter, <<3 */
u32 rho2_7ls; /* Rho^2, <<7 */
- u32 minrtt; /* Minimum smoothed round trip time value seen */
+ u32 minrtt_us; /* Minimum smoothed round trip time value seen */
};
/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)
{
struct hybla *ca = inet_csk_ca(sk);
- ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho_3ls = max_t(u32,
+ tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
+ 8U);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
ca->rho2 = ca->rho2_7ls >> 7;
@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)
hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */
- ca->minrtt = tp->srtt;
+ ca->minrtt_us = tp->srtt_us;
tp->snd_cwnd = ca->rho;
}
@@ -85,7 +87,7 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -93,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */
- if (tp->srtt < ca->minrtt){
+ if (tp->srtt_us < ca->minrtt_us) {
hybla_recalc_param(sk);
- ca->minrtt = tp->srtt;
+ ca->minrtt_us = tp->srtt_us;
}
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (!ca->hybla_en) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
@@ -165,7 +167,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
- .min_cwnd = tcp_reno_min_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 834857f3c87..5999b3972e6 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -23,7 +23,6 @@
#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
-#define U32_MAX ((u32)~0U)
#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
#define BETA_SHIFT 6
@@ -256,7 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
@@ -265,12 +264,12 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
update_params(sk);
/* RFC2861 only increase cwnd if fully utilized */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In slow start */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
u32 delta;
@@ -325,10 +324,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
}
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
- .flags = TCP_CONG_RTT_STAMP,
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
- .min_cwnd = tcp_reno_min_cwnd,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89eaa669..40639c288dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
*/
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
{
- int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int sndmem, per_mss;
+ u32 nr_segs;
+
+ /* Worst case is non GSO/TSO : each frame consumes one skb
+ * and skb->head is kmalloced using power of two area of memory
+ */
+ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+ MAX_TCP_HEADER +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+ * extra cushion (application might react slowly to POLLOUT)
+ */
+ sndmem = 2 * nr_segs * per_mss;
- sndmem *= TCP_INIT_CWND;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
@@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
tcp_default_init_rwnd(mss);
+ /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+ * Allow enough cushion so that sender is not limited by our window
+ */
+ if (sysctl_tcp_moderate_rcvbuf)
+ rcvmem <<= 2;
+
if (sk->sk_rcvbuf < rcvmem)
sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}
@@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
- tcp_fixup_sndbuf(sk);
+ tcp_sndbuf_expand(sk);
tp->rcvq_space.space = tp->rcv_wnd;
+ tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
@@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int time;
- int space;
-
- if (tp->rcvq_space.time == 0)
- goto new_measure;
+ int copied;
time = tcp_time_stamp - tp->rcvq_space.time;
if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
return;
- space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+ /* Number of bytes copied to user in last RTT */
+ copied = tp->copied_seq - tp->rcvq_space.seq;
+ if (copied <= tp->rcvq_space.space)
+ goto new_measure;
- space = max(tp->rcvq_space.space, space);
+ /* A bit of theory :
+ * copied = bytes received in previous RTT, our base window
+ * To cope with packet losses, we need a 2x factor
+ * To cope with slow start, and sender growing its cwin by 100 %
+ * every RTT, we need a 4x factor, because the ACK we are sending
+ * now is for the next RTT, not the current one :
+ * <prev RTT . ><current RTT .. ><next RTT .... >
+ */
- if (tp->rcvq_space.space != space) {
- int rcvmem;
+ if (sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvwin, rcvmem, rcvbuf;
- tp->rcvq_space.space = space;
+ /* minimal window to cope with packet losses, assuming
+ * steady state. Add some cushion because of small variations.
+ */
+ rcvwin = (copied << 1) + 16 * tp->advmss;
- if (sysctl_tcp_moderate_rcvbuf &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int new_clamp = space;
+ /* If rate increased by 25%,
+ * assume slow start, rcvwin = 3 * copied
+ * If rate increased by 50%,
+ * assume sender can use 2x growth, rcvwin = 4 * copied
+ */
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+ rcvwin <<= 1;
+ else
+ rcvwin += (rcvwin >> 1);
+ }
- /* Receive space grows, normalize in order to
- * take into account packet headers and sk_buff
- * structure overhead.
- */
- space /= tp->advmss;
- if (!space)
- space = 1;
- rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(rcvmem) < tp->advmss)
- rcvmem += 128;
- space *= rcvmem;
- space = min(space, sysctl_tcp_rmem[2]);
- if (space > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = space;
-
- /* Make the window clamp follow along. */
- tp->window_clamp = new_clamp;
- }
+ rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
+
+ rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = rcvbuf;
+
+ /* Make the window clamp follow along. */
+ tp->window_clamp = rcvwin;
}
}
+ tp->rcvq_space.space = copied;
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
@@ -625,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
- long m = mrtt; /* RTT */
+ long m = mrtt_us; /* RTT */
+ u32 srtt = tp->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@@ -646,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
*/
- if (m == 0)
- m = 1;
- if (tp->srtt != 0) {
- m -= (tp->srtt >> 3); /* m is now error in rtt est */
- tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (srtt != 0) {
+ m -= (srtt >> 3); /* m is now error in rtt est */
+ srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@@ -665,27 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
if (m > 0)
m >>= 3;
} else {
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
}
- tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
- if (tp->mdev > tp->mdev_max) {
- tp->mdev_max = tp->mdev;
- if (tp->mdev_max > tp->rttvar)
- tp->rttvar = tp->mdev_max;
+ tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev_us > tp->mdev_max_us) {
+ tp->mdev_max_us = tp->mdev_us;
+ if (tp->mdev_max_us > tp->rttvar_us)
+ tp->rttvar_us = tp->mdev_max_us;
}
if (after(tp->snd_una, tp->rtt_seq)) {
- if (tp->mdev_max < tp->rttvar)
- tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
+ if (tp->mdev_max_us < tp->rttvar_us)
+ tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
- tp->mdev_max = tcp_rto_min(sk);
+ tp->mdev_max_us = tcp_rto_min_us(sk);
}
} else {
/* no previous measure. */
- tp->srtt = m << 3; /* take the measured time to be rtt */
- tp->mdev = m << 1; /* make sure rto = 3*rtt */
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+ srtt = m << 3; /* take the measured time to be rtt */
+ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+ tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
}
+ tp->srtt_us = max(1U, srtt);
}
/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -700,26 +743,25 @@ static void tcp_update_pacing_rate(struct sock *sk)
u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
- rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+ rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
rate *= max(tp->snd_cwnd, tp->packets_out);
- /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
- * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
- * We probably need usec resolution in the future.
- * Note: This also takes care of possible srtt=0 case,
- * when tcp_rtt_estimator() was not yet called.
- */
- if (tp->srtt > 8 + 2)
- do_div(rate, tp->srtt);
+ if (likely(tp->srtt_us))
+ do_div(rate, tp->srtt_us);
- sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+ /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ * without any lock. We want to make sure compiler wont store
+ * intermediate values in this location.
+ */
+ ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+ sk->sk_max_pacing_rate);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-void tcp_set_rto(struct sock *sk)
+static void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -1064,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
}
/* D-SACK for already forgotten data... Do dumb counting. */
- if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+ if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
tp->undo_retrans--;
@@ -1073,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
}
struct tcp_sacktag_state {
- int reord;
- int fack_count;
- int flag;
- s32 rtt; /* RTT measured by SACKing never-retransmitted data */
+ int reord;
+ int fack_count;
+ long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ int flag;
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1120,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
unsigned int new_len = (pkt_len / mss) * mss;
if (!in_sack && new_len < pkt_len) {
new_len += mss;
- if (new_len > skb->len)
+ if (new_len >= skb->len)
return 0;
}
pkt_len = new_len;
}
- err = tcp_fragment(sk, skb, pkt_len, mss);
+ err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
}
@@ -1137,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount, u32 xmit_time)
+ int dup_sack, int pcount,
+ const struct skb_mstamp *xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
int fack_count = state->fack_count;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
- if (tp->undo_marker && tp->undo_retrans &&
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
@@ -1178,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
/* Pick the earliest sequence sacked for RTT */
- if (state->rtt < 0)
- state->rtt = tcp_time_stamp - xmit_time;
+ if (state->rtt_us < 0) {
+ struct skb_mstamp now;
+
+ skb_mstamp_get(&now);
+ state->rtt_us = skb_mstamp_us_delta(&now,
+ xmit_time);
+ }
}
if (sacked & TCPCB_LOST) {
@@ -1238,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
- TCP_SKB_CB(skb)->when);
+ &skb->skb_mstamp);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1284,7 +1332,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
}
- TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
@@ -1513,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
- TCP_SKB_CB(skb)->when);
+ &skb->skb_mstamp);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1570,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una, s32 *sack_rtt)
+ u32 prior_snd_una, long *sack_rtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1588,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
state.flag = 0;
state.reord = tp->packets_out;
- state.rtt = -1;
+ state.rtt_us = -1L;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1772,7 +1823,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
- *sack_rtt = state.rtt;
+ *sack_rtt_us = state.rtt_us;
return state.flag;
}
@@ -1842,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
tp->lost_out = 0;
tp->undo_marker = 0;
- tp->undo_retrans = 0;
+ tp->undo_retrans = -1;
}
void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1893,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how)
if (skb == tcp_send_head(sk))
break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->undo_marker = 0;
+
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -1982,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* available, or RTO is scheduled to fire first.
*/
if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
- (flag & FLAG_ECE) || !tp->srtt)
+ (flag & FLAG_ECE) || !tp->srtt_us)
return false;
- delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+ delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
+ msecs_to_jiffies(2));
+
if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
return false;
@@ -2187,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
break;
mss = skb_shinfo(skb)->gso_size;
- err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+ err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
+ mss, GFP_ATOMIC);
if (err < 0)
break;
cnt = packets;
@@ -2610,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
+ tp->undo_retrans = tp->retrans_out ? : -1;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
@@ -2630,13 +2685,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
bool recovered = !before(tp->snd_una, tp->high_seq);
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
- if (flag & FLAG_ORIG_SACK_ACKED) {
- /* Step 3.b. A timeout is spurious if not all data are
- * lost, i.e., never-retransmitted data are (s)acked.
- */
- tcp_try_undo_loss(sk, true);
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
return;
- }
+
if (after(tp->snd_nxt, tp->high_seq) &&
(flag & FLAG_DATA_SACKED || is_dupack)) {
tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
@@ -2832,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- s32 seq_rtt, s32 sack_rtt)
+ long seq_rtt_us, long sack_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2842,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* is acked (RFC6298).
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
- seq_rtt = -1;
+ seq_rtt_us = -1L;
- if (seq_rtt < 0)
- seq_rtt = sack_rtt;
+ if (seq_rtt_us < 0)
+ seq_rtt_us = sack_rtt_us;
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
@@ -2853,13 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
- if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ flag & FLAG_ACKED)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
- if (seq_rtt < 0)
+ if (seq_rtt_us < 0)
return false;
- tcp_rtt_estimator(sk, seq_rtt);
+ tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
/* RFC6298: only reset backoff on valid RTT measurement. */
@@ -2868,20 +2923,26 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
{
struct tcp_sock *tp = tcp_sk(sk);
- s32 seq_rtt = -1;
+ long seq_rtt_us = -1L;
+
+ if (synack_stamp && !tp->total_retrans)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
- if (tp->lsndtime && !tp->total_retrans)
- seq_rtt = tcp_time_stamp - tp->lsndtime;
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+ /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+ * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+ */
+ if (!tp->srtt_us)
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -2964,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, s32 sack_rtt)
+ u32 prior_snd_una, long sack_rtt_us)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct skb_mstamp first_ackt, last_ackt, now;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ u32 reord = tp->packets_out;
+ bool fully_acked = true;
+ long ca_seq_rtt_us = -1L;
+ long seq_rtt_us = -1L;
struct sk_buff *skb;
- u32 now = tcp_time_stamp;
- int fully_acked = true;
- int flag = 0;
u32 pkts_acked = 0;
- u32 reord = tp->packets_out;
- u32 prior_sacked = tp->sacked_out;
- s32 seq_rtt = -1;
- s32 ca_seq_rtt = -1;
- ktime_t last_ackt = net_invalid_timestamp();
+ bool rtt_update;
+ int flag = 0;
+
+ first_ackt.v64 = 0;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u32 acked_pcount;
u8 sacked = scb->sacked;
+ u32 acked_pcount;
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
@@ -3004,11 +3067,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else {
- ca_seq_rtt = now - scb->when;
- last_ackt = skb->tstamp;
- if (seq_rtt < 0) {
- seq_rtt = ca_seq_rtt;
- }
+ last_ackt = skb->skb_mstamp;
+ WARN_ON_ONCE(last_ackt.v64 == 0);
+ if (!first_ackt.v64)
+ first_ackt = last_ackt;
+
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
if (!after(scb->end_seq, tp->high_seq))
@@ -3054,14 +3117,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
- (flag & FLAG_ACKED))
- tcp_rearm_rto(sk);
+ skb_mstamp_get(&now);
+ if (first_ackt.v64) {
+ seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
+ ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
@@ -3083,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- if (ca_ops->pkts_acked) {
- s32 rtt_us = -1;
-
- /* Is the ACK triggering packet unambiguous? */
- if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
- /* High resolution needed and available? */
- if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
- !ktime_equal(last_ackt,
- net_invalid_timestamp()))
- rtt_us = ktime_us_delta(ktime_get_real(),
- last_ackt);
- else if (ca_seq_rtt >= 0)
- rtt_us = jiffies_to_usecs(ca_seq_rtt);
- }
+ if (ca_ops->pkts_acked)
+ ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
- ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
- }
+ } else if (skb && rtt_update && sack_rtt_us >= 0 &&
+ sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ tcp_rearm_rto(sk);
}
#if FASTRETRANS_DEBUG > 0
@@ -3288,7 +3349,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tcp_init_cwnd_reduction(sk, true);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
- tcp_set_ca_state(sk, TCP_CA_Open);
+ tcp_try_keep_open(sk);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPLOSSPROBERECOVERY);
}
@@ -3304,12 +3365,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
u32 prior_fackets;
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
- s32 sack_rtt = -1;
+ long sack_rtt_us = -1L;
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3337,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= FLAG_SND_UNA_ADVANCED;
prior_fackets = tp->fackets_out;
- prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3367,7 +3426,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt);
+ &sack_rtt_us);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
@@ -3386,12 +3445,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ sack_rtt_us);
acked -= tp->packets_out;
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, prior_in_flight);
+ tcp_cong_avoid(sk, ack, acked);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3409,8 +3469,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
- tcp_update_pacing_rate(sk);
+ tcp_update_pacing_rate(sk);
return 1;
no_queue:
@@ -3439,7 +3498,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt);
+ &sack_rtt_us);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
@@ -3623,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
int opcode = *ptr++;
int opsize;
- switch(opcode) {
+ switch (opcode) {
case TCPOPT_EOL:
return NULL;
case TCPOPT_NOP:
@@ -3983,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
WARN_ON(before(tp->rcv_nxt, sp->end_seq));
/* Zap this SACK, by moving forward any other SACKS. */
- for (i=this_sack+1; i < num_sacks; i++)
+ for (i = this_sack+1; i < num_sacks; i++)
tp->selective_acks[i-1] = tp->selective_acks[i];
num_sacks--;
continue;
@@ -4353,7 +4412,7 @@ queue_and_out:
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return;
}
@@ -4643,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk)
return -1;
}
-/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
- * As additional protections, we do not touch cwnd in retransmission phases,
- * and if application hit its sndbuf limit recently.
- */
-void tcp_cwnd_application_limited(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
- sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- /* Limited by application or receiver window. */
- u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
- u32 win_used = max(tp->snd_cwnd_used, init_win);
- if (win_used < tp->snd_cwnd) {
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
- }
- tp->snd_cwnd_used = 0;
- }
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -4701,15 +4738,7 @@ static void tcp_new_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_should_expand_sndbuf(sk)) {
- int sndmem = SKB_TRUESIZE(max_t(u32,
- tp->rx_opt.mss_clamp,
- tp->mss_cache) +
- MAX_TCP_HEADER);
- int demanded = max_t(unsigned int, tp->snd_cwnd,
- tp->reordering + 1);
- sndmem *= 2 * demanded;
- if (sndmem > sk->sk_sndbuf)
- sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+ tcp_sndbuf_expand(sk);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -4862,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
BUG();
tp->urg_data = TCP_URG_VALID | tmp;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
}
}
@@ -4948,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
(tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
(atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
} else if (chunk > 0) {
tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
out:
return copied_early;
@@ -5223,7 +5252,7 @@ no_ack:
#endif
if (eaten)
kfree_skb_partial(skb, fragstolen);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return;
}
}
@@ -5343,9 +5372,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
break;
}
tcp_rearm_rto(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
}
tp->syn_data_acked = tp->syn_data;
+ if (tp->syn_data_acked)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
return false;
}
@@ -5584,6 +5616,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct request_sock *req;
int queued = 0;
bool acceptable;
+ u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5666,16 +5699,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* so release it.
*/
if (req) {
+ synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
+ synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
tcp_mtup_init(sk);
- tcp_init_buffer_space(sk);
tp->copied_seq = tp->rcv_nxt;
+ tcp_init_buffer_space(sk);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5691,7 +5726,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, req);
+ tcp_synack_rtt_meas(sk, synack_stamp);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5709,6 +5744,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
} else
tcp_init_metrics(sk);
+ tcp_update_pacing_rate(sk);
+
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b14266bb91e..77cccda1ad0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -173,11 +173,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
- orig_sport, orig_dport, sk, true);
+ orig_sport, orig_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
@@ -288,6 +288,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
@@ -335,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
const int code = icmp_hdr(icmp_skb)->code;
struct sock *sk;
struct sk_buff *skb;
- struct request_sock *req;
- __u32 seq;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
__u32 remaining;
int err;
struct net *net = dev_net(icmp_skb->dev);
@@ -377,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk = inet_csk(sk);
tp = tcp_sk(sk);
- req = tp->fastopen_rsk;
seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
- !between(seq, tp->snd_una, tp->snd_nxt) &&
- (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
- /* For a Fast Open socket, allow seq to be snt_isn. */
+ !between(seq, snd_una, tp->snd_nxt)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -425,16 +426,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
break;
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
- !icsk->icsk_backoff)
+ !icsk->icsk_backoff || fastopen)
break;
- /* XXX (TFO) - revisit the following logic for TFO */
-
if (sock_owned_by_user(sk))
break;
icsk->icsk_backoff--;
- inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
+ inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
tcp_bound_rto(sk);
@@ -461,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
}
- /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
- * than following the TCP_SYN_RECV case and closing the socket,
- * we ignore the ICMP error and keep trying like a fully established
- * socket. Is this the right thing to do?
- */
- if (req && req->sk == NULL)
- goto out;
-
switch (sk->sk_state) {
struct request_sock *req, **prev;
case TCP_LISTEN:
@@ -501,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
case TCP_SYN_SENT:
- case TCP_SYN_RECV: /* Cannot happen.
- It can f.e. if SYNs crossed,
- or Fast Open.
- */
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * is already accepted it is treated as a connected one below.
+ */
+ if (fastopen && fastopen->sk == NULL)
+ break;
+
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
@@ -821,25 +815,26 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- u16 queue_mapping)
+ u16 queue_mapping,
+ struct tcp_fastopen_cookie *foc)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
- struct sk_buff * skb;
+ struct sk_buff *skb;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, NULL);
+ skb = tcp_make_synack(sk, dst, req, foc);
if (skb) {
- __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
+ __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
skb_set_queue_mapping(skb, queue_mapping);
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
if (!tcp_rsk(req)->snt_synack && !err)
@@ -851,10 +846,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
- int res = tcp_v4_send_synack(sk, NULL, req, 0);
+ int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
- if (!res)
+ if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ }
return res;
}
@@ -877,8 +874,6 @@ bool tcp_syn_flood_action(struct sock *sk,
bool want_cookie = false;
struct listen_sock *lopt;
-
-
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
msg = "Sending cookies";
@@ -972,7 +967,7 @@ static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
{
union tcp_md5_addr *addr;
- addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
+ addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
return tcp_md5_do_lookup(sk, addr, AF_INET);
}
@@ -1149,8 +1144,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
saddr = inet_sk(sk)->inet_saddr;
daddr = inet_sk(sk)->inet_daddr;
} else if (req) {
- saddr = inet_rsk(req)->loc_addr;
- daddr = inet_rsk(req)->rmt_addr;
+ saddr = inet_rsk(req)->ir_loc_addr;
+ daddr = inet_rsk(req)->ir_rmt_addr;
} else {
const struct iphdr *iph = ip_hdr(skb);
saddr = iph->saddr;
@@ -1259,187 +1254,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
};
#endif
-static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct tcp_fastopen_cookie *valid_foc)
-{
- bool skip_cookie = false;
- struct fastopen_queue *fastopenq;
-
- if (likely(!fastopen_cookie_present(foc))) {
- /* See include/net/tcp.h for the meaning of these knobs */
- if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
- ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
- (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
- skip_cookie = true; /* no cookie to validate */
- else
- return false;
- }
- fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- /* A FO option is present; bump the counter. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
-
- /* Make sure the listener has enabled fastopen, and we don't
- * exceed the max # of pending TFO requests allowed before trying
- * to validating the cookie in order to avoid burning CPU cycles
- * unnecessarily.
- *
- * XXX (TFO) - The implication of checking the max_qlen before
- * processing a cookie request is that clients can't differentiate
- * between qlen overflow causing Fast Open to be disabled
- * temporarily vs a server not supporting Fast Open at all.
- */
- if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
- fastopenq == NULL || fastopenq->max_qlen == 0)
- return false;
-
- if (fastopenq->qlen >= fastopenq->max_qlen) {
- struct request_sock *req1;
- spin_lock(&fastopenq->lock);
- req1 = fastopenq->rskq_rst_head;
- if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
- spin_unlock(&fastopenq->lock);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
- /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
- foc->len = -1;
- return false;
- }
- fastopenq->rskq_rst_head = req1->dl_next;
- fastopenq->qlen--;
- spin_unlock(&fastopenq->lock);
- reqsk_free(req1);
- }
- if (skip_cookie) {
- tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- return true;
- }
-
- if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
- if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, valid_foc);
- if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
- memcmp(&foc->val[0], &valid_foc->val[0],
- TCP_FASTOPEN_COOKIE_SIZE) != 0)
- return false;
- valid_foc->len = -1;
- }
- /* Acknowledge the data received from the peer. */
- tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- return true;
- } else if (foc->len == 0) { /* Client requesting a cookie */
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, valid_foc);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENCOOKIEREQD);
- } else {
- /* Client sent a cookie with wrong size. Treat it
- * the same as invalid and return a valid one.
- */
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, valid_foc);
- }
- return false;
-}
-
-static int tcp_v4_conn_req_fastopen(struct sock *sk,
- struct sk_buff *skb,
- struct sk_buff *skb_synack,
- struct request_sock *req)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct sock *child;
- int err;
-
- req->num_retrans = 0;
- req->num_timeout = 0;
- req->sk = NULL;
-
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
- if (child == NULL) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- kfree_skb(skb_synack);
- return -1;
- }
- err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
- ireq->rmt_addr, ireq->opt);
- err = net_xmit_eval(err);
- if (!err)
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- /* XXX (TFO) - is it ok to ignore error and continue? */
-
- spin_lock(&queue->fastopenq->lock);
- queue->fastopenq->qlen++;
- spin_unlock(&queue->fastopenq->lock);
-
- /* Initialize the child socket. Have to fix some values to take
- * into account the child is a Fast Open socket and is created
- * only out of the bits carried in the SYN packet.
- */
- tp = tcp_sk(child);
-
- tp->fastopen_rsk = req;
- /* Do a hold on the listner sk so that if the listener is being
- * closed, the child that has been accepted can live on and still
- * access listen_lock.
- */
- sock_hold(sk);
- tcp_rsk(req)->listener = sk;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is never
- * scaled. So correct it appropriately.
- */
- tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
-
- /* Activate the retrans timer so that SYNACK can be retransmitted.
- * The request socket is not added to the SYN table of the parent
- * because it's been added to the accept queue directly.
- */
- inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
- TCP_TIMEOUT_INIT, TCP_RTO_MAX);
-
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
-
- /* Now finish processing the fastopen child socket. */
- inet_csk(child)->icsk_af_ops->rebuild_header(child);
- tcp_init_congestion_control(child);
- tcp_mtup_init(child);
- tcp_init_buffer_space(child);
- tcp_init_metrics(child);
-
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- *
- * XXX (TFO) - we honor a zero-payload TFO request for now.
- * (Any reason not to?)
- */
- if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
- /* Don't queue the skb if there is no payload in SYN.
- * XXX (TFO) - How about SYN+FIN?
- */
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- } else {
- skb = skb_get(skb);
- skb_dst_drop(skb);
- __skb_pull(skb, tcp_hdr(skb)->doff * 4);
- skb_set_owner_r(skb, child);
- __skb_queue_tail(&child->sk_receive_queue, skb);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->syn_data_acked = 1;
- }
- sk->sk_data_ready(sk, 0);
- bh_unlock_sock(child);
- sock_put(child);
- WARN_ON(req->sk == NULL);
- return 0;
-}
-
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_options_received tmp_opt;
@@ -1450,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- bool want_cookie = false;
+ bool want_cookie = false, fastopen;
struct flowi4 fl4;
struct tcp_fastopen_cookie foc = { .len = -1 };
- struct tcp_fastopen_cookie valid_foc = { .len = -1 };
- struct sk_buff *skb_synack;
- int do_fastopen;
+ int err;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1502,10 +1314,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
- ireq->loc_addr = daddr;
- ireq->rmt_addr = saddr;
+ ireq->ir_loc_addr = daddr;
+ ireq->ir_rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(skb);
+ ireq->ir_mark = inet_request_mark(sk, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
@@ -1554,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
isn = tcp_v4_init_sequence(skb);
}
- tcp_rsk(req)->snt_isn = isn;
-
- if (dst == NULL) {
- dst = inet_csk_route_req(sk, &fl4, req);
- if (dst == NULL)
- goto drop_and_free;
- }
- do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
-
- /* We don't call tcp_v4_send_synack() directly because we need
- * to make sure a child socket can be created successfully before
- * sending back synack!
- *
- * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
- * (or better yet, call tcp_send_synack() in the child context
- * directly, but will have to fix bunch of other code first)
- * after syn_recv_sock() except one will need to first fix the
- * latter to remove its dependency on the current implementation
- * of tcp_v4_send_synack()->tcp_select_initial_window().
- */
- skb_synack = tcp_make_synack(sk, dst, req,
- fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
-
- if (skb_synack) {
- __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
- skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
- } else
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
goto drop_and_free;
- if (likely(!do_fastopen)) {
- int err;
- err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
- ireq->rmt_addr, ireq->opt);
- err = net_xmit_eval(err);
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = tcp_v4_send_synack(sk, dst, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
if (err || want_cookie)
goto drop_and_free;
tcp_rsk(req)->snt_synack = tcp_time_stamp;
tcp_rsk(req)->listener = NULL;
- /* Add the request_sock to the SYN table */
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- if (fastopen_cookie_present(&foc) && foc.len != 0)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
- goto drop_and_free;
+ }
return 0;
@@ -1644,9 +1429,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->inet_daddr = ireq->rmt_addr;
- newinet->inet_rcv_saddr = ireq->loc_addr;
- newinet->inet_saddr = ireq->loc_addr;
+ newinet->inet_daddr = ireq->ir_rmt_addr;
+ newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = ireq->opt;
rcu_assign_pointer(newinet->inet_opt, inet_opt);
ireq->opt = NULL;
@@ -1667,7 +1452,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
sk_setup_caps(newsk, dst);
- tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&
@@ -1744,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
return sk;
}
-static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
-
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!tcp_v4_check(skb->len, iph->saddr,
- iph->daddr, skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- return 0;
- }
- }
-
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len, IPPROTO_TCP, 0);
-
- if (skb->len <= 76) {
- return __skb_checksum_complete(skb);
- }
- return 0;
-}
-
-
/* The socket must have it's spinlock held when we get
* here.
*
@@ -1960,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
- if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
+
+ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
th = tcp_hdr(skb);
@@ -2194,18 +1957,6 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
-{
- return hlist_nulls_empty(head) ? NULL :
- list_entry(head->first, struct inet_timewait_sock, tw_node);
-}
-
-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
-{
- return !is_a_nulls(tw->tw_node.next) ?
- hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
-}
-
/*
* Get next listener socket follow cur. If cur is NULL, get first socket
* starting from bucket given in st->bucket; when st->bucket is zero the
@@ -2309,10 +2060,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline bool empty_bucket(struct tcp_iter_state *st)
+static inline bool empty_bucket(const struct tcp_iter_state *st)
{
- return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
- hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+ return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
@@ -2329,7 +2079,6 @@ static void *established_get_first(struct seq_file *seq)
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
- struct inet_timewait_sock *tw;
spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
/* Lockless fast path for the common case of empty buckets */
@@ -2345,18 +2094,7 @@ static void *established_get_first(struct seq_file *seq)
rc = sk;
goto out;
}
- st->state = TCP_SEQ_STATE_TIME_WAIT;
- inet_twsk_for_each(tw, node,
- &tcp_hashinfo.ehash[st->bucket].twchain) {
- if (tw->tw_family != st->family ||
- !net_eq(twsk_net(tw), net)) {
- continue;
- }
- rc = tw;
- goto out;
- }
spin_unlock_bh(lock);
- st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
return rc;
@@ -2365,7 +2103,6 @@ out:
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct sock *sk = cur;
- struct inet_timewait_sock *tw;
struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
@@ -2373,45 +2110,16 @@ static void *established_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
- tw = cur;
- tw = tw_next(tw);
-get_tw:
- while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
- tw = tw_next(tw);
- }
- if (tw) {
- cur = tw;
- goto out;
- }
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
- st->state = TCP_SEQ_STATE_ESTABLISHED;
-
- /* Look for next non empty bucket */
- st->offset = 0;
- while (++st->bucket <= tcp_hashinfo.ehash_mask &&
- empty_bucket(st))
- ;
- if (st->bucket > tcp_hashinfo.ehash_mask)
- return NULL;
-
- spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
- sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
- } else
- sk = sk_nulls_next(sk);
+ sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
- goto found;
+ return sk;
}
- st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
- goto get_tw;
-found:
- cur = sk;
-out:
- return cur;
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ ++st->bucket;
+ return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
@@ -2464,10 +2172,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
if (rc)
break;
st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
/* Fallthrough */
case TCP_SEQ_STATE_ESTABLISHED:
- case TCP_SEQ_STATE_TIME_WAIT:
- st->state = TCP_SEQ_STATE_ESTABLISHED;
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
@@ -2524,7 +2231,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
break;
case TCP_SEQ_STATE_ESTABLISHED:
- case TCP_SEQ_STATE_TIME_WAIT:
rc = established_get_next(seq, v);
break;
}
@@ -2548,7 +2254,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
- case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2598,18 +2303,18 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
- struct seq_file *f, int i, kuid_t uid, int *len)
+ struct seq_file *f, int i, kuid_t uid)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
i,
- ireq->loc_addr,
+ ireq->ir_loc_addr,
ntohs(inet_sk(sk)->inet_sport),
- ireq->rmt_addr,
- ntohs(ireq->rmt_port),
+ ireq->ir_rmt_addr,
+ ntohs(ireq->ir_rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
@@ -2619,11 +2324,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
0, /* non standard timer */
0, /* open_requests have no inode */
atomic_read(&sk->sk_refcnt),
- req,
- len);
+ req);
}
-static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
int timer_active;
unsigned long timer_expires;
@@ -2662,7 +2366,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
- "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
i, src, srcp, dest, destp, sk->sk_state,
tp->write_seq - tp->snd_una,
rx_queue,
@@ -2679,16 +2383,15 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
tp->snd_cwnd,
sk->sk_state == TCP_LISTEN ?
(fastopenq ? fastopenq->max_qlen : 0) :
- (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
- len);
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
- struct seq_file *f, int i, int *len)
+ struct seq_file *f, int i)
{
__be32 dest, src;
__u16 destp, srcp;
- long delta = tw->tw_ttd - jiffies;
+ s32 delta = tw->tw_ttd - inet_tw_time_stamp();
dest = tw->tw_daddr;
src = tw->tw_rcv_saddr;
@@ -2696,10 +2399,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
srcp = ntohs(tw->tw_sport);
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
- atomic_read(&tw->tw_refcnt), tw, len);
+ atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150
@@ -2707,11 +2410,11 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st;
- int len;
+ struct sock *sk = v;
+ seq_setwidth(seq, TMPSZ - 1);
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-*s\n", TMPSZ - 1,
- " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode");
goto out;
@@ -2721,17 +2424,17 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
case TCP_SEQ_STATE_ESTABLISHED:
- get_tcp4_sock(v, seq, st->num, &len);
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
- break;
- case TCP_SEQ_STATE_TIME_WAIT:
- get_timewait4_sock(v, seq, st->num, &len);
+ get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
break;
}
- seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
+ seq_pad(seq, '\n');
return 0;
}
@@ -2806,6 +2509,7 @@ struct proto tcp_prot = {
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
+ .sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 72f7218b03f..1e70fa8fa79 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
}
/**
@@ -314,11 +314,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
}
static struct tcp_congestion_ops tcp_lp __read_mostly = {
- .flags = TCP_CONG_RTT_STAMP,
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_lp_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.pkts_acked = tcp_lp_pkts_acked,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 559d4ae6ebf..f7a2ec3ac58 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,18 +6,6 @@
#include <linux/memcontrol.h>
#include <linux/module.h>
-static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
-{
- return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
-}
-
-static void memcg_tcp_enter_memory_pressure(struct sock *sk)
-{
- if (sk->sk_cgrp->memory_pressure)
- *sk->sk_cgrp->memory_pressure = 1;
-}
-EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
-
int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
/*
@@ -27,34 +15,24 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
*/
struct res_counter *res_parent = NULL;
struct cg_proto *cg_proto, *parent_cg;
- struct tcp_memcontrol *tcp;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct net *net = current->nsproxy->net_ns;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return 0;
- tcp = tcp_from_cgproto(cg_proto);
-
- tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
- tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
- tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
- tcp->tcp_memory_pressure = 0;
+ cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
+ cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
+ cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
+ cg_proto->memory_pressure = 0;
+ cg_proto->memcg = memcg;
parent_cg = tcp_prot.proto_cgroup(parent);
if (parent_cg)
- res_parent = parent_cg->memory_allocated;
+ res_parent = &parent_cg->memory_allocated;
- res_counter_init(&tcp->tcp_memory_allocated, res_parent);
- percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
-
- cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
- cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
- cg_proto->sysctl_mem = tcp->tcp_prot_mem;
- cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
- cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
- cg_proto->memcg = memcg;
+ res_counter_init(&cg_proto->memory_allocated, res_parent);
+ percpu_counter_init(&cg_proto->sockets_allocated, 0);
return 0;
}
@@ -63,23 +41,18 @@ EXPORT_SYMBOL(tcp_init_cgroup);
void tcp_destroy_cgroup(struct mem_cgroup *memcg)
{
struct cg_proto *cg_proto;
- struct tcp_memcontrol *tcp;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return;
- tcp = tcp_from_cgproto(cg_proto);
- percpu_counter_destroy(&tcp->tcp_sockets_allocated);
+ percpu_counter_destroy(&cg_proto->sockets_allocated);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
{
- struct net *net = current->nsproxy->net_ns;
- struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
- u64 old_lim;
int i;
int ret;
@@ -90,16 +63,13 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
if (val > RES_COUNTER_MAX)
val = RES_COUNTER_MAX;
- tcp = tcp_from_cgproto(cg_proto);
-
- old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
- ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
+ ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
if (ret)
return ret;
for (i = 0; i < 3; i++)
- tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
- net->ipv4.sysctl_tcp_mem[i]);
+ cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT,
+ sysctl_tcp_mem[i]);
if (val == RES_COUNTER_MAX)
clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
@@ -132,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
return 0;
}
-static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned long long val;
int ret = 0;
- switch (cft->private) {
+ buf = strstrip(buf);
+
+ switch (of_cft(of)->private) {
case RES_LIMIT:
/* see memcontrol.c */
- ret = res_counter_memparse_write_strategy(buffer, &val);
+ ret = res_counter_memparse_write_strategy(buf, &val);
if (ret)
break;
ret = tcp_update_limit(memcg, val);
@@ -151,33 +123,29 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
ret = -EINVAL;
break;
}
- return ret;
+ return ret ?: nbytes;
}
static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
{
- struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return default_val;
- tcp = tcp_from_cgproto(cg_proto);
- return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
+ return res_counter_read_u64(&cg_proto->memory_allocated, type);
}
static u64 tcp_read_usage(struct mem_cgroup *memcg)
{
- struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
- tcp = tcp_from_cgproto(cg_proto);
- return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+ return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
}
static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -202,61 +170,33 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
return val;
}
-static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
+static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg;
- struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
- memcg = mem_cgroup_from_css(css);
+ memcg = mem_cgroup_from_css(of_css(of));
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
- return 0;
- tcp = tcp_from_cgproto(cg_proto);
+ return nbytes;
- switch (event) {
+ switch (of_cft(of)->private) {
case RES_MAX_USAGE:
- res_counter_reset_max(&tcp->tcp_memory_allocated);
+ res_counter_reset_max(&cg_proto->memory_allocated);
break;
case RES_FAILCNT:
- res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
+ res_counter_reset_failcnt(&cg_proto->memory_allocated);
break;
}
- return 0;
-}
-
-unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
-{
- struct tcp_memcontrol *tcp;
- struct cg_proto *cg_proto;
-
- cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
- if (!cg_proto)
- return 0;
-
- tcp = tcp_from_cgproto(cg_proto);
- return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-}
-
-void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
-{
- struct tcp_memcontrol *tcp;
- struct cg_proto *cg_proto;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return;
-
- tcp = tcp_from_cgproto(cg_proto);
-
- tcp->tcp_prot_mem[idx] = val;
+ return nbytes;
}
static struct cftype tcp_files[] = {
{
.name = "kmem.tcp.limit_in_bytes",
- .write_string = tcp_cgroup_write,
+ .write = tcp_cgroup_write,
.read_u64 = tcp_cgroup_read,
.private = RES_LIMIT,
},
@@ -268,13 +208,13 @@ static struct cftype tcp_files[] = {
{
.name = "kmem.tcp.failcnt",
.private = RES_FAILCNT,
- .trigger = tcp_cgroup_reset,
+ .write = tcp_cgroup_reset,
.read_u64 = tcp_cgroup_read,
},
{
.name = "kmem.tcp.max_usage_in_bytes",
.private = RES_MAX_USAGE,
- .trigger = tcp_cgroup_reset,
+ .write = tcp_cgroup_reset,
.read_u64 = tcp_cgroup_read,
},
{ } /* terminate */
@@ -282,7 +222,7 @@ static struct cftype tcp_files[] = {
static int __init tcp_memcontrol_init(void)
{
- WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
return 0;
}
__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4a22f3e715d..4fe04180598 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -22,6 +22,10 @@
int sysctl_tcp_nometrics_save __read_mostly;
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash);
+
struct tcp_fastopen_metrics {
u16 mss;
u16 syn_loss:10; /* Recurring Fast Open SYN losses */
@@ -29,14 +33,20 @@ struct tcp_fastopen_metrics {
struct tcp_fastopen_cookie cookie;
};
+/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
+ * Kernel only stores RTT and RTTVAR in usec resolution
+ */
+#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
+
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
- struct inetpeer_addr tcpm_addr;
+ struct inetpeer_addr tcpm_saddr;
+ struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
u32 tcpm_ts;
u32 tcpm_ts_stamp;
u32 tcpm_lock;
- u32 tcpm_vals[TCP_METRIC_MAX + 1];
+ u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
struct tcp_fastopen_metrics tcpm_fastopen;
struct rcu_head rcu_head;
@@ -54,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,
return tm->tcpm_vals[idx];
}
-static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
- enum tcp_metric_index idx)
-{
- return msecs_to_jiffies(tm->tcpm_vals[idx]);
-}
-
static void tcp_metric_set(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
@@ -67,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
tm->tcpm_vals[idx] = val;
}
-static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
- enum tcp_metric_index idx,
- u32 val)
-{
- tm->tcpm_vals[idx] = jiffies_to_msecs(val);
-}
-
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
@@ -96,9 +93,11 @@ struct tcpm_hash_bucket {
static DEFINE_SPINLOCK(tcp_metrics_lock);
-static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
+static void tcpm_suck_dst(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst,
bool fastopen_clear)
{
+ u32 msval;
u32 val;
tm->tcpm_stamp = jiffies;
@@ -116,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
val |= 1 << TCP_METRIC_REORDERING;
tm->tcpm_lock = val;
- tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
- tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+ msval = dst_metric_raw(dst, RTAX_RTT);
+ tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
+
+ msval = dst_metric_raw(dst, RTAX_RTTVAR);
+ tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
@@ -130,16 +132,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
}
}
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ tcpm_suck_dst(tm, dst, false);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
- struct inetpeer_addr *addr,
- unsigned int hash,
- bool reclaim)
+ struct inetpeer_addr *saddr,
+ struct inetpeer_addr *daddr,
+ unsigned int hash)
{
struct tcp_metrics_block *tm;
struct net *net;
+ bool reclaim = false;
spin_lock_bh(&tcp_metrics_lock);
net = dev_net(dst->dev);
+
+ /* While waiting for the spin-lock the cache might have been populated
+ * with this entry and so we have to check again.
+ */
+ tm = __tcp_get_metrics(saddr, daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (tm) {
+ tcpm_check_stamp(tm, dst);
+ goto out_unlock;
+ }
+
if (unlikely(reclaim)) {
struct tcp_metrics_block *oldest;
@@ -155,7 +183,8 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (!tm)
goto out_unlock;
}
- tm->tcpm_addr = *addr;
+ tm->tcpm_saddr = *saddr;
+ tm->tcpm_daddr = *daddr;
tcpm_suck_dst(tm, dst, true);
@@ -169,17 +198,6 @@ out_unlock:
return tm;
}
-#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
-
-static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
-{
- if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
- tcpm_suck_dst(tm, dst, false);
-}
-
-#define TCP_METRICS_RECLAIM_DEPTH 5
-#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
-
static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
if (tm)
@@ -189,7 +207,8 @@ static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, in
return NULL;
}
-static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
struct net *net, unsigned int hash)
{
struct tcp_metrics_block *tm;
@@ -197,7 +216,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *a
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
- if (addr_same(&tm->tcpm_addr, addr))
+ if (addr_same(&tm->tcpm_saddr, saddr) &&
+ addr_same(&tm->tcpm_daddr, daddr))
break;
depth++;
}
@@ -208,20 +228,25 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
struct dst_entry *dst)
{
struct tcp_metrics_block *tm;
- struct inetpeer_addr addr;
+ struct inetpeer_addr saddr, daddr;
unsigned int hash;
struct net *net;
- addr.family = req->rsk_ops->family;
- switch (addr.family) {
+ saddr.family = req->rsk_ops->family;
+ daddr.family = req->rsk_ops->family;
+ switch (daddr.family) {
case AF_INET:
- addr.addr.a4 = inet_rsk(req)->rmt_addr;
- hash = (__force unsigned int) addr.addr.a4;
+ saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
+ daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
+ hash = (__force unsigned int) daddr.addr.a4;
break;
+#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
- hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
+ *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
+ *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
+ hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
break;
+#endif
default:
return NULL;
}
@@ -231,7 +256,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
- if (addr_same(&tm->tcpm_addr, &addr))
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr))
break;
}
tcpm_check_stamp(tm, dst);
@@ -240,33 +266,45 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
- struct inet6_timewait_sock *tw6;
struct tcp_metrics_block *tm;
- struct inetpeer_addr addr;
+ struct inetpeer_addr saddr, daddr;
unsigned int hash;
struct net *net;
- addr.family = tw->tw_family;
- switch (addr.family) {
- case AF_INET:
- addr.addr.a4 = tw->tw_daddr;
- hash = (__force unsigned int) addr.addr.a4;
- break;
- case AF_INET6:
- tw6 = inet6_twsk((struct sock *)tw);
- *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
- hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
- break;
- default:
- return NULL;
+ if (tw->tw_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
+ hash = ipv6_addr_hash(&tw->tw_v6_daddr);
+ }
}
+#endif
+ else
+ return NULL;
net = twsk_net(tw);
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
- if (addr_same(&tm->tcpm_addr, &addr))
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr))
break;
}
return tm;
@@ -277,36 +315,45 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
bool create)
{
struct tcp_metrics_block *tm;
- struct inetpeer_addr addr;
+ struct inetpeer_addr saddr, daddr;
unsigned int hash;
struct net *net;
- bool reclaim;
- addr.family = sk->sk_family;
- switch (addr.family) {
- case AF_INET:
- addr.addr.a4 = inet_sk(sk)->inet_daddr;
- hash = (__force unsigned int) addr.addr.a4;
- break;
- case AF_INET6:
- *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
- hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
- break;
- default:
- return NULL;
+ if (sk->sk_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr;
+ hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+ }
}
+#endif
+ else
+ return NULL;
net = dev_net(dst->dev);
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
- tm = __tcp_get_metrics(&addr, net, hash);
- reclaim = false;
- if (tm == TCP_METRICS_RECLAIM_PTR) {
- reclaim = true;
+ tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR)
tm = NULL;
- }
if (!tm && create)
- tm = tcpm_new(dst, &addr, hash, reclaim);
+ tm = tcpm_new(dst, &saddr, &daddr, hash);
else
tcpm_check_stamp(tm, dst);
@@ -334,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst);
rcu_read_lock();
- if (icsk->icsk_backoff || !tp->srtt) {
+ if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why?
* Probably, no packets returned in time. Reset our
* results.
@@ -349,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)
if (!tm)
goto out_unlock;
- rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
- m = rtt - tp->srtt;
+ rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt_us;
/* If newly calculated rtt larger than stored one, store new
* one. Otherwise, use EWMA. Remember, rtt overestimation is
@@ -358,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)
*/
if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
if (m <= 0)
- rtt = tp->srtt;
+ rtt = tp->srtt_us;
else
rtt -= (m >> 3);
- tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
+ tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
}
if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
@@ -372,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)
/* Scale deviation to rttvar fixed point */
m >>= 1;
- if (m < tp->mdev)
- m = tp->mdev;
+ if (m < tp->mdev_us)
+ m = tp->mdev_us;
- var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+ var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
if (m >= var)
var = m;
else
var -= (var - m) >> 2;
- tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
}
if (tcp_in_initial_slowstart(tp)) {
@@ -478,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)
tp->reordering = val;
}
- crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
+ crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
rcu_read_unlock();
reset:
/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
@@ -501,16 +548,20 @@ reset:
* to low value, and then abruptly stops to do it and starts to delay
* ACKs, wait for troubles.
*/
- if (crtt > tp->srtt) {
- inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
- } else if (tp->srtt == 0) {
+ if (crtt > tp->srtt_us) {
+ /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
+ crtt /= 8 * USEC_PER_MSEC;
+ inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
+ } else if (tp->srtt_us == 0) {
/* RFC6298: 5.7 We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs
* from the more aggressive 1sec to avoid more spurious
* retransmission.
*/
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
+ tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
+
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
@@ -657,16 +708,20 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
+ struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_metrics_block *tm;
+ if (!dst)
+ return;
rcu_read_lock();
- tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+ tm = tcp_get_metrics(sk, dst, true);
if (tm) {
struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
write_seqlock_bh(&fastopen_seqlock);
- tfom->mss = mss;
- if (cookie->len > 0)
+ if (mss)
+ tfom->mss = mss;
+ if (cookie && cookie->len > 0)
tfom->cookie = *cookie;
if (syn_lost) {
++tfom->syn_loss;
@@ -714,15 +769,21 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
struct nlattr *nest;
int i;
- switch (tm->tcpm_addr.family) {
+ switch (tm->tcpm_daddr.family) {
case AF_INET:
if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
- tm->tcpm_addr.addr.a4) < 0)
+ tm->tcpm_daddr.addr.a4) < 0)
+ goto nla_put_failure;
+ if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+ tm->tcpm_saddr.addr.a4) < 0)
goto nla_put_failure;
break;
case AF_INET6:
if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
- tm->tcpm_addr.addr.a6) < 0)
+ tm->tcpm_daddr.addr.a6) < 0)
+ goto nla_put_failure;
+ if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16,
+ tm->tcpm_saddr.addr.a6) < 0)
goto nla_put_failure;
break;
default:
@@ -747,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
if (!nest)
goto nla_put_failure;
- for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
- if (!tm->tcpm_vals[i])
+ for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
+ u32 val = tm->tcpm_vals[i];
+
+ if (!val)
continue;
- if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
+ if (i == TCP_METRIC_RTT) {
+ if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (i == TCP_METRIC_RTTVAR) {
+ if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (nla_put_u32(msg, i + 1, val) < 0)
goto nla_put_failure;
n++;
}
@@ -845,44 +922,66 @@ done:
return skb->len;
}
-static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
- unsigned int *hash, int optional)
+static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional, int v4, int v6)
{
struct nlattr *a;
- a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
+ a = info->attrs[v4];
if (a) {
addr->family = AF_INET;
addr->addr.a4 = nla_get_be32(a);
- *hash = (__force unsigned int) addr->addr.a4;
+ if (hash)
+ *hash = (__force unsigned int) addr->addr.a4;
return 0;
}
- a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
+ a = info->attrs[v6];
if (a) {
if (nla_len(a) != sizeof(struct in6_addr))
return -EINVAL;
addr->family = AF_INET6;
memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
- *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
+ if (hash)
+ *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
return 0;
}
return optional ? 1 : -EAFNOSUPPORT;
}
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional)
+{
+ return __parse_nl_addr(info, addr, hash, optional,
+ TCP_METRICS_ATTR_ADDR_IPV4,
+ TCP_METRICS_ATTR_ADDR_IPV6);
+}
+
+static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
+{
+ return __parse_nl_addr(info, addr, NULL, 0,
+ TCP_METRICS_ATTR_SADDR_IPV4,
+ TCP_METRICS_ATTR_SADDR_IPV6);
+}
+
static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct tcp_metrics_block *tm;
- struct inetpeer_addr addr;
+ struct inetpeer_addr saddr, daddr;
unsigned int hash;
struct sk_buff *msg;
struct net *net = genl_info_net(info);
void *reply;
int ret;
+ bool src = true;
- ret = parse_nl_addr(info, &addr, &hash, 0);
+ ret = parse_nl_addr(info, &daddr, &hash, 0);
if (ret < 0)
return ret;
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
@@ -897,7 +996,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
rcu_read_lock();
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
- if (addr_same(&tm->tcpm_addr, &addr)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
ret = tcp_metrics_fill_info(msg, tm);
break;
}
@@ -952,36 +1052,42 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct tcpm_hash_bucket *hb;
struct tcp_metrics_block *tm;
struct tcp_metrics_block __rcu **pp;
- struct inetpeer_addr addr;
+ struct inetpeer_addr saddr, daddr;
unsigned int hash;
struct net *net = genl_info_net(info);
int ret;
+ bool src = true, found = false;
- ret = parse_nl_addr(info, &addr, &hash, 1);
+ ret = parse_nl_addr(info, &daddr, &hash, 1);
if (ret < 0)
return ret;
if (ret > 0)
return tcp_metrics_flush_all(net);
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
hb = net->ipv4.tcp_metrics_hash + hash;
pp = &hb->chain;
spin_lock_bh(&tcp_metrics_lock);
- for (tm = deref_locked_genl(*pp); tm;
- pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
- if (addr_same(&tm->tcpm_addr, &addr)) {
+ for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
*pp = tm->tcpm_next;
- break;
+ kfree_rcu(tm, rcu_head);
+ found = true;
+ } else {
+ pp = &tm->tcpm_next;
}
}
spin_unlock_bh(&tcp_metrics_lock);
- if (!tm)
+ if (!found)
return -ESRCH;
- kfree_rcu(tm, rcu_head);
return 0;
}
-static struct genl_ops tcp_metrics_nl_ops[] = {
+static const struct genl_ops tcp_metrics_nl_ops[] = {
{
.cmd = TCP_METRICS_CMD_GET,
.doit = tcp_metrics_nl_cmd_get,
@@ -1053,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net)
tm = next;
}
}
- if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
- vfree(net->ipv4.tcp_metrics_hash);
- else
- kfree(net->ipv4.tcp_metrics_hash);
+ kvfree(net->ipv4.tcp_metrics_hash);
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
@@ -1072,8 +1175,7 @@ void __init tcp_metrics_init(void)
if (ret < 0)
goto cleanup;
ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
- tcp_metrics_nl_ops,
- ARRAY_SIZE(tcp_metrics_nl_ops));
+ tcp_metrics_nl_ops);
if (ret < 0)
goto cleanup_subsys;
return;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 58a3e69aef6..e68e0d4af6c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -293,13 +293,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- tw6->tw_v6_daddr = np->daddr;
- tw6->tw_v6_rcv_saddr = np->rcv_saddr;
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
+ tw->tw_flowlabel = np->flow_label >> 12;
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -364,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+void tcp_openreq_init_rwin(struct request_sock *req,
+ struct sock *sk, struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+ int mss = dst_metric_advmss(dst);
+
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+ mss = tp->rx_opt.user_mss;
+
+ /* Set this up on the first call only */
+ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+ req->window_clamp = tcp_full_space(sk);
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+ ireq->rcv_wscale = rcv_wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
struct request_sock *req)
{
@@ -400,8 +429,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_init_wl(newtp, treq->rcv_isn);
- newtp->srtt = 0;
- newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->srtt_us = 0;
+ newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -428,7 +457,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
- skb_queue_head_init(&newtp->out_of_order_queue);
+ __skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
@@ -747,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
skb->len);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 3a7525e6c08..55046ecd083 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,10 +14,11 @@
#include <net/tcp.h>
#include <net/protocol.h>
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
@@ -56,8 +57,12 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
SKB_GSO_MPLS |
SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
@@ -94,21 +99,13 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
th->check = newcheck;
if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check =
- csum_fold(csum_partial(skb_transport_header(skb),
- thlen, skb->csum));
+ th->check = gso_make_checksum(skb, ~th->check);
seq += mss;
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
- /* {tcp|sock}_wfree() use exact truesize accounting :
- * sum(skb->truesize) MUST be exactly be gso_skb->truesize
- * So we account mss bytes of 'true size' for each segment.
- * The last segment will contain the remaining.
- */
- skb->truesize = mss;
- gso_skb->truesize -= mss;
+ sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);
@@ -125,7 +122,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
- swap(gso_skb->truesize, skb->truesize);
+ sum_truesize += skb->truesize;
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
@@ -134,12 +133,10 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check = csum_fold(csum_partial(skb_transport_header(skb),
- thlen, skb->csum));
+ th->check = gso_make_checksum(skb, ~th->check);
out:
return segs;
}
-EXPORT_SYMBOL(tcp_tso_segment);
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
@@ -198,7 +195,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
goto out_check_final;
found:
- flush = NAPI_GRO_CB(p)->flush;
+ /* Include the IP ID check below from the inner most IP hdr */
+ flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -231,17 +229,16 @@ out_check_final:
pp = head;
out:
- NAPI_GRO_CB(skb)->flush |= flush;
+ NAPI_GRO_CB(skb)->flush |= (flush != 0);
return pp;
}
-EXPORT_SYMBOL(tcp_gro_receive);
int tcp_gro_complete(struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
- skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_start = (unsigned char *)th - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check);
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -273,46 +270,46 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb)
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
+ /* Use the IP hdr immediately proceeding for this transport */
const struct iphdr *iph = skb_gro_network_header(skb);
__wsum wsum;
- __sum16 sum;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip_csum;
+
+ wsum = NAPI_GRO_CB(skb)->csum;
switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
+ 0);
+
+ /* fall through */
+
case CHECKSUM_COMPLETE:
if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
- skb->csum)) {
+ wsum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
-flush:
+
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
-
- case CHECKSUM_NONE:
- wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb_gro_len(skb), IPPROTO_TCP, 0);
- sum = csum_fold(skb_checksum(skb,
- skb_gro_offset(skb),
- skb_gro_len(skb),
- wsum));
- if (sum)
- goto flush;
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
}
+skip_csum:
return tcp_gro_receive(head, skb);
}
-static int tcp4_gro_complete(struct sk_buff *skb)
+static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
- th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
- iph->saddr, iph->daddr, 0);
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+ th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+ iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
return tcp_gro_complete(skb);
}
@@ -320,7 +317,7 @@ static int tcp4_gro_complete(struct sk_buff *skb)
static const struct net_offload tcpv4_offload = {
.callbacks = {
.gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
+ .gso_segment = tcp_gso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
},
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7c83cb8bf13..179b51e6bda 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
tcp_rearm_rto(sk);
}
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+ tcp_skb_pcount(skb));
}
/* SND.NXT, if window was not shrunk.
@@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);
static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 old_win = tp->rcv_wnd;
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
@@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)
*
* Relax Will Robinson.
*/
+ if (new_win == 0)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPWANTZEROWINDOWADV);
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
}
tp->rcv_wnd = new_win;
@@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk)
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
- if (new_win == 0)
+ if (new_win == 0) {
tp->pred_flags = 0;
+ if (old_win)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTOZEROWINDOWADV);
+ } else if (old_win == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+ }
return new_win;
}
@@ -363,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
TCP_SKB_CB(skb)->tcp_flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ shinfo->gso_segs = 1;
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -406,7 +421,7 @@ struct tcp_out_options {
* Beware: Something in the Internet is very sensitive to the ordering of
* TCP options, we learned this through the hard way, so be careful here.
* Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
* the ordering which we have been using if we want to keep working with
* those broken things (not that it currently hurts anybody as there isn't
* particular reason why the ordering would need to be changed).
@@ -612,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
if (unlikely(!ireq->tstamp_ok))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- if (foc != NULL) {
+ if (foc != NULL && foc->len >= 0) {
u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
@@ -637,6 +652,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
unsigned int size = 0;
unsigned int eff_sacks;
+ opts->options = 0;
+
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (unlikely(*md5)) {
@@ -677,7 +694,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
*
* Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
* needs to be reallocated in a driver.
- * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
*
* Since transmit from skb destructor is forbidden, we use a tasklet
* to process all sockets that eventually need to send more skbs.
@@ -694,12 +711,13 @@ static void tcp_tsq_handler(struct sock *sk)
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+ tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ 0, GFP_ATOMIC);
}
/*
- * One tasklest per cpu tries to send more skbs.
+ * One tasklet per cpu tries to send more skbs.
* We run in tasklet context but need to disable irqs when
- * transfering tsq->head because tcp_wfree() might
+ * transferring tsq->head because tcp_wfree() might
* interrupt us (non NAPI drivers)
*/
static void tcp_tasklet_func(unsigned long data)
@@ -762,6 +780,17 @@ void tcp_release_cb(struct sock *sk)
if (flags & (1UL << TCP_TSQ_DEFERRED))
tcp_tsq_handler(sk);
+ /* Here begins the tricky part :
+ * We are called from release_sock() with :
+ * 1) BH disabled
+ * 2) sk_lock.slock spinlock held
+ * 3) socket owned by us (sk->sk_lock.owned == 1)
+ *
+ * But following code is meant to be called from BH handlers,
+ * so we should keep BH disabled, but early release socket ownership
+ */
+ sock_release_ownership(sk);
+
if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
tcp_write_timer_handler(sk);
__sock_put(sk);
@@ -793,7 +822,7 @@ void __init tcp_tasklet_init(void)
/*
* Write buffer destructor automatically called from kfree_skb.
- * We cant xmit new skbs from this context, as we might already
+ * We can't xmit new skbs from this context, as we might already
* hold qdisc lock.
*/
void tcp_wfree(struct sk_buff *skb)
@@ -848,19 +877,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
BUG_ON(!skb || !tcp_skb_pcount(skb));
- /* If congestion control is doing timestamping, we must
- * take such a timestamp before we potentially clone/copy.
- */
- if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
- __net_timestamp(skb);
-
- if (likely(clone_it)) {
- const struct sk_buff *fclone = skb + 1;
-
- if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
- fclone->fclone == SKB_FCLONE_CLONE))
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ if (clone_it) {
+ skb_mstamp_get(&skb->skb_mstamp);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -868,6 +886,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
+ /* Our usage of tstamp should remain private */
+ skb->tstamp.tv64 = 0;
}
inet = inet_sk(sk);
@@ -895,8 +915,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
- skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
- tcp_wfree : sock_wfree;
+ skb->destructor = tcp_wfree;
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
@@ -955,7 +974,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
- err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (likely(err <= 0))
return err;
@@ -985,18 +1004,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
- if (skb->len <= mss_now || !sk_can_gso(sk) ||
- skb->ip_summed == CHECKSUM_NONE) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
+ if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ shinfo->gso_segs = 1;
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
} else {
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
- skb_shinfo(skb)->gso_size = mss_now;
- skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+ shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ shinfo->gso_size = mss_now;
+ shinfo->gso_type = sk->sk_gso_type;
}
}
@@ -1051,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
* Remember, these are still headerless SKBs at this point.
*/
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
- unsigned int mss_now)
+ unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
@@ -1066,13 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;
- if (skb_cloned(skb) &&
- skb_is_nonlinear(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+ buff = sk_stream_alloc_skb(sk, nsize, gfp);
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
@@ -1145,6 +1166,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
*/
static void __pskb_trim_head(struct sk_buff *skb, int len)
{
+ struct skb_shared_info *shinfo;
int i, k, eat;
eat = min_t(int, len, skb_headlen(skb));
@@ -1156,23 +1178,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
}
eat = len;
k = 0;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ shinfo = skb_shinfo(skb);
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ int size = skb_frag_size(&shinfo->frags[i]);
if (size <= eat) {
skb_frag_unref(skb, i);
eat -= size;
} else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+ shinfo->frags[k].page_offset += eat;
+ skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
k++;
}
}
- skb_shinfo(skb)->nr_frags = k;
+ shinfo->nr_frags = k;
skb_reset_tail_pointer(skb);
skb->data_len -= len;
@@ -1357,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
return mss_now;
}
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ /* Limited by application or receiver window. */
+ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ u32 win_used = max(tp->snd_cwnd_used, init_win);
+ if (win_used < tp->snd_cwnd) {
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ }
+ tp->snd_cwnd_used = 0;
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->packets_out >= tp->snd_cwnd) {
+ /* Track the maximum number of outstanding packets in each
+ * window, and remember whether we were cwnd-limited then.
+ */
+ if (!before(tp->snd_una, tp->max_packets_seq) ||
+ tp->packets_out > tp->max_packets_out) {
+ tp->max_packets_out = tp->packets_out;
+ tp->max_packets_seq = tp->snd_nxt;
+ tp->is_cwnd_limited = is_cwnd_limited;
+ }
+
+ if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1377,23 +1431,51 @@ static void tcp_cwnd_validate(struct sock *sk)
}
}
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+ return after(tp->snd_sml, tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ * tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ * skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+ const struct sk_buff *skb)
+{
+ if (skb->len < tcp_skb_pcount(skb) * mss_now)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
*/
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
- unsigned int mss_now, unsigned int max_segs)
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+ int nonagle)
+{
+ return partial &&
+ ((nonagle & TCP_NAGLE_CORK) ||
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+ const struct sk_buff *skb,
+ unsigned int mss_now,
+ unsigned int max_segs,
+ int nonagle)
{
const struct tcp_sock *tp = tcp_sk(sk);
- u32 needed, window, max_len;
+ u32 partial, needed, window, max_len;
window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
max_len = mss_now * max_segs;
@@ -1406,7 +1488,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
if (max_len <= needed)
return max_len;
- return needed - needed % mss_now;
+ partial = needed % mss_now;
+ /* If last segment is not a full MSS, check if Nagle rules allow us
+ * to include this last segment in this skb.
+ * Otherwise, we'll split the skb at last MSS boundary
+ */
+ if (tcp_nagle_check(partial != 0, tp, nonagle))
+ return needed - partial;
+
+ return needed;
}
/* Can at least one segment of SKB be sent right now, according to the
@@ -1446,28 +1536,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
return tso_segs;
}
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
- return after(tp->snd_sml, tp->snd_una) &&
- !after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- * With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
- const struct sk_buff *skb,
- unsigned int mss_now, int nonagle)
-{
- return skb->len < mss_now &&
- ((nonagle & TCP_NAGLE_CORK) ||
- (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
/* Return true if the Nagle test allows this packet to be
* sent now.
@@ -1488,7 +1556,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
return true;
- if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
return true;
return false;
@@ -1557,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
- return tcp_fragment(sk, skb, len, mss_now);
+ return tcp_fragment(sk, skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp);
if (unlikely(buff == NULL))
@@ -1600,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
*
* This algorithm is from John Heffner.
*/
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ bool *is_cwnd_limited)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1664,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
if (!tp->tso_deferred)
tp->tso_deferred = 1 | (jiffies << 1);
+ if (cong_win < send_win && cong_win < skb->len)
+ *is_cwnd_limited = true;
+
return true;
send_now:
@@ -1824,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
+ bool is_cwnd_limited = false;
sent_pkts = 0;
@@ -1840,7 +1913,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
-
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
@@ -1849,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
+ is_cwnd_limited = true;
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
@@ -1865,23 +1938,42 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
- if (!push_one && tcp_tso_should_defer(sk, skb))
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
break;
}
- /* TSQ : sk_wmem_alloc accounts skb truesize,
- * including skb overhead. But thats OK.
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
*/
- if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+ limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
+ sk->sk_pacing_rate >> 10);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- break;
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ break;
}
+
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
- sk->sk_gso_max_segs));
+ sk->sk_gso_max_segs),
+ nonagle);
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -1912,7 +2004,7 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
- tcp_cwnd_validate(sk);
+ tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -1923,7 +2015,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, tlp_time_stamp, rto_time_stamp;
- u32 rtt = tp->srtt >> 3;
+ u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
return false;
@@ -1945,7 +2037,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+ if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
return false;
@@ -1976,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
return true;
}
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct sk_buff *fclone = skb + 1;
+
+ if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+ fclone->fclone == SKB_FCLONE_CLONE)) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
+ return false;
+}
+
/* When probe timeout (PTO) fires, send a new segment if one exists, else
* retransmit the last segment.
*/
@@ -2001,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
if (WARN_ON(!skb))
goto rearm_timer;
+ if (skb_still_in_host_queue(sk, skb))
+ goto rearm_timer;
+
pcount = tcp_skb_pcount(skb);
if (WARN_ON(!pcount))
goto rearm_timer;
if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
- if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+ if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+ GFP_ATOMIC)))
goto rearm_timer;
skb = tcp_write_queue_tail(sk);
}
@@ -2014,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
- /* Probe with zero data doesn't trigger fast recovery. */
- if (skb->len > 0)
- err = __tcp_retransmit_skb(sk, skb);
+ err = __tcp_retransmit_skb(sk, skb);
/* Record snd_nxt for loss detection. */
if (likely(!err))
@@ -2030,7 +2143,6 @@ rearm_timer:
if (likely(!err))
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPLOSSPROBES);
- return;
}
/* Push out any pending frames which were held back due to
@@ -2128,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk)
*/
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
- int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+ int allowed_space = tcp_full_space(sk);
+ int full_space = min_t(int, tp->window_clamp, allowed_space);
int window;
if (mss > full_space)
@@ -2141,7 +2254,19 @@ u32 __tcp_select_window(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
- if (free_space < mss)
+ /* free_space might become our new window, make sure we don't
+ * increase it due to wscale.
+ */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ /* if free space is less than mss estimate, or is below 1/16th
+ * of the maximum allowed, try to move to zero-window, else
+ * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+ * new incoming data is dropped due to memory limits.
+ * With large window, mss test triggers way too late in order
+ * to announce zero window in time before rmem limit kicks in.
+ */
+ if (free_space < (allowed_space >> 4) || free_space < mss)
return 0;
}
@@ -2296,6 +2421,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int cur_mss;
+ int err;
/* Inconslusive MTU probe */
if (icsk->icsk_mtup.probe_size) {
@@ -2309,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
return -EAGAIN;
+ if (skb_still_in_host_queue(sk, skb))
+ return -EBUSY;
+
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
@@ -2331,12 +2460,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
return -EAGAIN;
if (skb->len > cur_mss) {
- if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
int oldpcount = tcp_skb_pcount(skb);
if (unlikely(oldpcount > 1)) {
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
tcp_init_tso_segs(sk, skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
@@ -2344,21 +2475,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tcp_retrans_try_collapse(sk, skb, cur_mss);
- /* Some Solaris stacks overoptimize and ignore the FIN on a
- * retransmit when old data is attached. So strip it off
- * since it is cheap to do so and saves bytes on the network.
- */
- if (skb->len > 0 &&
- (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
- tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
- if (!pskb_trim(skb, 0)) {
- /* Reuse, even though it does some unnecessary work */
- tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
- TCP_SKB_CB(skb)->tcp_flags);
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
-
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
@@ -2372,11 +2488,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
GFP_ATOMIC);
- return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
} else {
- return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
+
+ if (likely(!err)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+ /* Update global TCP statistics. */
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans++;
}
+ return err;
}
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
@@ -2385,11 +2511,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err = __tcp_retransmit_skb(sk, skb);
if (err == 0) {
- /* Update global TCP statistics. */
- TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
-
- tp->total_retrans++;
-
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
net_dbg_ratelimited("retrans_out leaked\n");
@@ -2404,15 +2525,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = TCP_SKB_CB(skb)->when;
- tp->undo_retrans += tcp_skb_pcount(skb);
-
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
*/
TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
- } else {
+ } else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
+
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
return err;
}
@@ -2673,7 +2796,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
int tcp_header_size;
int mss;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2688,27 +2811,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
mss = tp->rx_opt.user_mss;
- if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
- __u8 rcv_wscale;
- /* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
- /* limit the window selection if the user enforce a smaller rx buffer */
- if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
- req->window_clamp = tcp_full_space(sk);
-
- /* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
- ireq->wscale_ok,
- &rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
- ireq->rcv_wscale = rcv_wscale;
- }
-
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
@@ -2727,8 +2829,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->syn = 1;
th->ack = 1;
TCP_ECN_make_synack(req, th);
- th->source = ireq->loc_port;
- th->dest = ireq->rmt_port;
+ th->source = htons(ireq->ir_num);
+ th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
* not even correctly set)
*/
@@ -2743,7 +2845,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->window = htons(min(req->rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), tp, &opts);
th->doff = (tcp_header_size >> 2);
- TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
@@ -2758,7 +2860,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
-void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2880,7 +2982,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
- syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+ space = min_t(size_t, space, fo->size);
+
+ /* limit to order-0 allocations */
+ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
+
+ syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
sk->sk_allocation);
if (syn_data == NULL)
goto fallback;
@@ -2910,9 +3017,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
tcp_connect_queue_skb(sk, data);
fo->copied = data->len;
+ /* syn_data is about to be sent, we need to take current time stamps
+ * for the packets that are in write queue : SYN packet and DATA
+ */
+ skb_mstamp_get(&syn->skb_mstamp);
+ data->skb_mstamp = syn->skb_mstamp;
+
if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
tp->syn_data = (fo->copied > 0);
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
goto done;
}
syn_data = NULL;
@@ -3000,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk)
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
- if (tp->srtt) {
- int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+ if (tp->srtt_us) {
+ int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+ TCP_DELACK_MIN);
if (rtt < max_ato)
max_ato = rtt;
@@ -3099,7 +3213,6 @@ void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
- tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
tcp_xmit_probe_skb(sk, 0);
}
}
@@ -3130,7 +3243,7 @@ int tcp_write_wakeup(struct sock *sk)
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
- if (tcp_fragment(sk, skb, seg_size, mss))
+ if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(sk, skb, mss);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 611beab38a0..3b66610d415 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -38,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.1");
-static int port __read_mostly = 0;
+static int port __read_mostly;
MODULE_PARM_DESC(port, "Port to match (0=all)");
module_param(port, int, 0);
@@ -46,7 +46,7 @@ static unsigned int bufsize __read_mostly = 4096;
MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
module_param(bufsize, uint, 0);
-static unsigned int fwmark __read_mostly = 0;
+static unsigned int fwmark __read_mostly;
MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
module_param(fwmark, uint, 0);
@@ -101,22 +101,6 @@ static inline int tcp_probe_avail(void)
si4.sin_addr.s_addr = inet->inet_##mem##addr; \
} while (0) \
-#if IS_ENABLED(CONFIG_IPV6)
-#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \
- do { \
- struct ipv6_pinfo *pi6 = inet->pinet6; \
- si6.sin6_family = AF_INET6; \
- si6.sin6_port = inet->inet_##mem##port; \
- si6.sin6_addr = pi6->mem##addr; \
- si6.sin6_flowinfo = 0; /* No need here. */ \
- si6.sin6_scope_id = 0; /* No need here. */ \
- } while (0)
-#else
-#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \
- do { \
- memset(&si6, 0, sizeof(si6)); \
- } while (0)
-#endif
/*
* Hook inserted to be called before each receive packet.
@@ -147,8 +131,17 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
break;
case AF_INET6:
- tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
- tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
+ memset(&p->src.v6, 0, sizeof(p->src.v6));
+ memset(&p->dst.v6, 0, sizeof(p->dst.v6));
+#if IS_ENABLED(CONFIG_IPV6)
+ p->src.v6.sin6_family = AF_INET6;
+ p->src.v6.sin6_port = inet->inet_sport;
+ p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
+
+ p->dst.v6.sin6_family = AF_INET6;
+ p->dst.v6.sin6_port = inet->inet_dport;
+ p->dst.v6.sin6_addr = sk->sk_v6_daddr;
+#endif
break;
default:
BUG();
@@ -161,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
p->snd_wnd = tp->snd_wnd;
p->rcv_wnd = tp->rcv_wnd;
p->ssthresh = tcp_current_ssthresh(sk);
- p->srtt = tp->srtt >> 3;
+ p->srtt = tp->srtt_us >> 3;
tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 8ce55b8aaec..8250949b885 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,15 +15,15 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
@@ -38,7 +38,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
.cong_avoid = tcp_scalable_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.owner = THIS_MODULE,
.name = "scalable",
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4b85e6f636c..286227abed1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,12 +156,19 @@ static bool retransmits_timed_out(struct sock *sk,
static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
int retry_until;
bool do_reset, syn_set = false;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (icsk->icsk_retransmits)
+ if (icsk->icsk_retransmits) {
dst_negative_advice(sk);
+ if (tp->syn_fastopen || tp->syn_data)
+ tcp_fastopen_cache_set(sk, 0, NULL, true);
+ if (tp->syn_data)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
syn_set = true;
} else {
@@ -374,9 +381,8 @@ void tcp_retransmit_timer(struct sock *sk)
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
- &np->daddr,
+ &sk->sk_v6_daddr,
ntohs(inet->inet_dport), inet->inet_num,
tp->snd_una, tp->snd_nxt);
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 80fa2bfd7ed..9a5e05f27f4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,13 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
return min(tp->snd_ssthresh, tp->snd_cwnd-1);
}
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
@@ -194,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
u32 rtt, diff;
u64 target_cwnd;
@@ -243,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
@@ -283,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
}
/* Use normal slow start */
else if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
}
@@ -305,11 +305,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
static struct tcp_congestion_ops tcp_vegas __read_mostly = {
- .flags = TCP_CONG_RTT_STAMP,
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_vegas_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 6c0eea2f824..0531b99d863 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -15,10 +15,10 @@ struct vegas {
u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
};
-extern void tcp_vegas_init(struct sock *sk);
-extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
-extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
-extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ac43cd747bc..27b9825753d 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,18 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
/* limited by applications */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* We do the Veno calculations only if we got enough rtt samples */
@@ -133,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
u64 target_cwnd;
u32 rtt;
@@ -152,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
if (veno->diff < beta) {
@@ -202,7 +202,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
}
static struct tcp_congestion_ops tcp_veno __read_mostly = {
- .flags = TCP_CONG_RTT_STAMP,
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
.cong_avoid = tcp_veno_cong_avoid,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 76a1e23259e..b94a04ae2ed 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_westwood_bw_rttmin,
.cwnd_event = tcp_westwood_event,
.get_info = tcp_westwood_info,
.pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 05c3b6f0e8e..599b79b8eac 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -3,7 +3,7 @@
* YeAH TCP
*
* For further details look at:
- * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
*
*/
#include <linux/mm.h>
@@ -15,13 +15,13 @@
#include "tcp_vegas.h"
-#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck
-#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt
-#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
-#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
-#define TCP_YEAH_PHY 8 //lin maximum delta from base
-#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
-#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count
+#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY 8 /* maximum delta from base */
+#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */
#define TCP_SCALABLE_AI_CNT 100U
@@ -69,16 +69,16 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else if (!yeah->doing_reno_now) {
/* Scalable */
@@ -213,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
if (yeah->doing_reno_now < TCP_YEAH_RHO) {
reduction = yeah->lastQ;
- reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
+ reduction = min(reduction, max(tp->snd_cwnd>>1, 2U));
- reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+ reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
} else
reduction = max(tp->snd_cwnd>>1, 2U);
@@ -226,11 +226,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
- .flags = TCP_CONG_RTT_STAMP,
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
.cong_avoid = tcp_yeah_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 74d2c95db57..7d5a8661df7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -103,6 +103,7 @@
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
+#include <net/inet_hashtables.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -219,10 +220,10 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
- inet_get_local_port_range(&low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- rand = net_random();
+ rand = prandom_u32();
first = (((u64)rand * remaining) >> 32) + low;
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
@@ -245,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
do {
if (low <= snum && snum <= high &&
!test_bit(snum >> udptable->log, bitmap) &&
- !inet_is_reserved_local_port(snum))
+ !inet_is_local_reserved_port(net, snum))
goto found;
snum += rand;
} while (snum != first);
@@ -406,6 +407,18 @@ static inline int compute_score2(struct sock *sk, struct net *net,
return score;
}
+static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 udp_ehash_secret __read_mostly;
+
+ net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ udp_ehash_secret + net_hash_mix(net));
+}
+
/* called with read_rcu_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
@@ -429,8 +442,8 @@ begin:
badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
- hash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -510,8 +523,8 @@ begin:
badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
- hash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -547,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport,
struct udp_table *udptable)
{
- struct sock *sk;
const struct iphdr *iph = ip_hdr(skb);
- if (unlikely(sk = skb_steal_sock(skb)))
- return sk;
- else
- return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
- iph->daddr, dport, inet_iif(skb),
- udptable);
+ return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ udptable);
}
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
@@ -565,6 +574,26 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, unsigned short hnum)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(sk) ||
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ return false;
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ return false;
+ return true;
+}
+
static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
@@ -575,20 +604,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
unsigned short hnum = ntohs(loc_port);
sk_nulls_for_each_from(s, node) {
- struct inet_sock *inet = inet_sk(s);
-
- if (!net_eq(sock_net(s), net) ||
- udp_sk(s)->udp_port_hash != hnum ||
- (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
- (inet->inet_dport != rmt_port && inet->inet_dport) ||
- (inet->inet_rcv_saddr &&
- inet->inet_rcv_saddr != loc_addr) ||
- ipv6_only_sock(s) ||
- (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
- continue;
- if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
- continue;
- goto found;
+ if (__udp_is_mcast_sock(net, s,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum))
+ goto found;
}
s = NULL;
found:
@@ -658,7 +678,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
break;
case ICMP_REDIRECT:
ipv4_sk_redirect(skb, sk);
- break;
+ goto out;
}
/*
@@ -707,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
struct udphdr *uh = udp_hdr(skb);
- struct sk_buff *frags = skb_shinfo(skb)->frag_list;
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
int hlen = len;
__wsum csum = 0;
- if (!frags) {
+ if (!skb_has_frag_list(skb)) {
/*
* Only one fragment on the socket.
*/
@@ -722,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
uh->check = ~csum_tcpudp_magic(src, dst, len,
IPPROTO_UDP, 0);
} else {
+ struct sk_buff *frags;
+
/*
* HW-checksum won't work as there are two or more
* fragments on the socket so that all csums of sk_buffs
* should be together
*/
- do {
+ skb_walk_frags(skb, frags) {
csum = csum_add(csum, frags->csum);
hlen -= frags->len;
- } while ((frags = frags->next));
+ }
csum = skb_checksum(skb, offset, hlen, csum);
skb->ip_summed = CHECKSUM_NONE;
@@ -742,6 +763,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
}
EXPORT_SYMBOL_GPL(udp4_hwcsum);
+/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
+ */
+void udp_set_csum(bool nocheck, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, int len)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ else if (skb_dst(skb) && skb_dst(skb)->dev &&
+ (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ } else {
+ __wsum csum;
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, len, 0);
+ uh->check = udp_v4_check(len, saddr, daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+}
+EXPORT_SYMBOL(udp_set_csum);
+
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
{
struct sock *sk = skb->sk;
@@ -765,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
- else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
+ else if (sk->sk_no_check_tx) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
@@ -855,6 +913,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
@@ -880,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* Get and verify the address.
*/
if (msg->msg_name) {
- struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -909,7 +969,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sock_tx_timestamp(sk, &ipc.tx_flags);
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc,
+ sk->sk_family == AF_INET6);
if (err)
return err;
if (ipc.opt)
@@ -938,7 +999,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
faddr = ipc.opt->opt.faddr;
connected = 0;
}
- tos = RT_TOS(inet->tos);
+ tos = get_rttos(&ipc, inet);
if (sock_flag(sk, SOCK_LOCALROUTE) ||
(msg->msg_flags & MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->opt.is_strictroute)) {
@@ -964,7 +1025,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
fl4 = &fl4_stack;
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,
+ inet_sk_flowi_flags(sk),
faddr, saddr, dport, inet->inet_sport);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
@@ -973,7 +1034,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -1072,6 +1133,9 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
struct udp_sock *up = udp_sk(sk);
int ret;
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
if (!up->pending) {
struct msghdr msg = { .msg_flags = flags|MSG_MORE };
@@ -1201,7 +1265,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
int peeked, off = 0;
@@ -1209,14 +1273,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int is_udplite = IS_UDPLITE(sk);
bool slow;
- /*
- * Check any passed addresses
- */
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len);
+ return ip_recv_error(sk, msg, len, addr_len);
try_again:
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
@@ -1276,6 +1334,7 @@ try_again:
sin->sin_port = udp_hdr(skb)->source;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
@@ -1403,8 +1462,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
- if (inet_sk(sk)->inet_daddr)
+ if (inet_sk(sk)->inet_daddr) {
sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ }
rc = sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
@@ -1472,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {
int ret;
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
ret = encap_rcv(sk, skb);
if (ret <= 0) {
UDP_INC_STATS_BH(sock_net(sk),
@@ -1523,12 +1588,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto csum_error;
- if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
goto drop;
+ }
rc = 0;
- ipv4_pktinfo_prepare(skb);
+ ipv4_pktinfo_prepare(sk, skb);
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
@@ -1577,6 +1645,18 @@ static void flush_stack(struct sock **stack, unsigned int count,
kfree_skb(skb1);
}
+/* For TCP sockets, sk_rx_dst is protected by socket lock
+ * For UDP, we use xchg() to guard against concurrent changes.
+ */
+static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+ struct dst_entry *old;
+
+ dst_hold(dst);
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+}
+
/*
* Multicasts and broadcasts go to each listener.
*
@@ -1637,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
int proto)
{
- const struct iphdr *iph;
int err;
UDP_SKB_CB(skb)->partial_cov = 0;
@@ -1649,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- iph = ip_hdr(skb);
- if (uh->check == 0) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- proto, skb->csum))
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
- if (!skb_csum_unnecessary(skb))
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len, proto, 0);
- /* Probably, we should checksum udp header (it should be in cache
- * in any case) and data in tiny packets (< rx copybreak).
- */
-
- return 0;
+ return skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
@@ -1705,16 +1770,33 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(net, skb, uh,
- saddr, daddr, udptable);
+ sk = skb_steal_sock(skb);
+ if (sk) {
+ struct dst_entry *dst = skb_dst(skb);
+ int ret;
+
+ if (unlikely(sk->sk_rx_dst != dst))
+ udp_sk_rx_dst_set(sk, dst);
- sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ } else {
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable);
+
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ }
if (sk != NULL) {
int ret;
- sk_mark_napi_id(sk, skb);
ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
@@ -1768,6 +1850,142 @@ drop:
return 0;
}
+/* We can only early demux multicast if there is a single matching socket.
+ * If more than one socket found returns NULL
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ struct udp_hslot *hslot = &udp_table.hash[slot];
+
+ /* Do not bother scanning a too big list */
+ if (hslot->count > 10)
+ return NULL;
+
+ rcu_read_lock();
+begin:
+ count = 0;
+ result = NULL;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum)) {
+ result = sk;
+ ++count;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (count != 1 ||
+ unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!__udp_is_mcast_sock(net, result,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups. The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int slot2 = hash2 & udp_table.mask;
+ struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
+ const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+ rcu_read_lock();
+ result = NULL;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr, ports, dif))
+ result = sk;
+ /* Only check first socket in chain */
+ break;
+ }
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr,
+ ports, dif))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+void udp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph;
+ const struct udphdr *uh;
+ struct sock *sk;
+ struct dst_entry *dst;
+ int dif = skb->dev->ifindex;
+
+ /* validate the packet */
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST)
+ sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ else if (skb->pkt_type == PACKET_HOST)
+ sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ else
+ return;
+
+ if (!sk)
+ return;
+
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ dst = sk->sk_rx_dst;
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst)
+ skb_dst_set_noref(skb, dst);
+}
+
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
@@ -1795,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
int (*push_pending_frames)(struct sock *))
{
struct udp_sock *up = udp_sk(sk);
- int val;
+ int val, valbool;
int err = 0;
int is_udplite = IS_UDPLITE(sk);
@@ -1805,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
if (get_user(val, (int __user *)optval))
return -EFAULT;
+ valbool = val ? 1 : 0;
+
switch (optname) {
case UDP_CORK:
if (val != 0) {
@@ -1834,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
}
break;
+ case UDP_NO_CHECK6_TX:
+ up->no_check6_tx = valbool;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ up->no_check6_rx = valbool;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -1916,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->encap_type;
break;
+ case UDP_NO_CHECK6_TX:
+ val = up->no_check6_tx;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ val = up->no_check6_rx;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
@@ -2150,7 +2386,7 @@ EXPORT_SYMBOL(udp_proc_unregister);
/* ------------------------------------------------------------------------ */
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
- int bucket, int *len)
+ int bucket)
{
struct inet_sock *inet = inet_sk(sp);
__be32 dest = inet->inet_daddr;
@@ -2159,7 +2395,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
@@ -2167,23 +2403,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
0, sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops), len);
+ atomic_read(&sp->sk_drops));
}
int udp4_seq_show(struct seq_file *seq, void *v)
{
+ seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-127s\n",
- " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
struct udp_iter_state *state = seq->private;
- int len;
- udp4_format_sock(v, seq, state->bucket, &len);
- seq_printf(seq, "%*s\n", 127 - len, "");
+ udp4_format_sock(v, seq, state->bucket);
}
+ seq_pad(seq, '\n');
return 0;
}
@@ -2296,11 +2531,16 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
int mac_len = skb->mac_len;
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
__be16 protocol = skb->protocol;
netdev_features_t enc_features;
- int outer_hlen;
+ int udp_offset, outer_hlen;
+ unsigned int oldlen;
+ bool need_csum;
+
+ oldlen = (u16)~skb->len;
if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
goto out;
@@ -2312,17 +2552,25 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = htons(ETH_P_TEB);
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+ if (need_csum)
+ skb->encap_hdr_csum = 1;
+
/* segment inner packet. */
enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs))
+ if (!segs || IS_ERR(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
goto out;
+ }
outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
skb = segs;
do {
struct udphdr *uh;
- int udp_offset = outer_hlen - tnl_hlen;
+ int len;
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
@@ -2333,31 +2581,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
uh = udp_hdr(skb);
- uh->len = htons(skb->len - udp_offset);
-
- /* csum segment if tunnel sets skb with csum. */
- if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
- struct iphdr *iph = ip_hdr(skb);
+ uh->len = htons(len);
- uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- skb->len - udp_offset,
- IPPROTO_UDP, 0);
- uh->check = csum_fold(skb_checksum(skb, udp_offset,
- skb->len - udp_offset, 0));
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
+ if (need_csum) {
+ __be32 delta = htonl(oldlen + len);
- } else if (protocol == htons(ETH_P_IPV6)) {
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
- u32 len = skb->len - udp_offset;
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ uh->check = gso_make_checksum(skb, ~uh->check);
- uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
- len, IPPROTO_UDP, 0);
- uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
- skb->ip_summed = CHECKSUM_NONE;
}
skb->protocol = protocol;
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 5a681e298b9..f3c27899f62 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,30 +5,30 @@
#include <net/protocol.h>
#include <net/inet_common.h>
-extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
-extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
-extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
-extern int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-extern int udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
+int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
-extern int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-extern int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
#endif
-extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len);
-extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags);
-extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-extern void udp_destroy_sock(struct sock *sk);
+int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len);
+int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
+ int flags);
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
-extern int udp4_seq_show(struct seq_file *seq, void *v);
+int udp4_seq_show(struct seq_file *seq, void *v);
#endif
#endif /* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f35eccaa855..546d2d439dd 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,6 +14,17 @@
#include <net/udp.h>
#include <net/protocol.h>
+static DEFINE_SPINLOCK(udp_offload_lock);
+static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
+
+#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
+
+struct udp_offload_priv {
+ struct udp_offload *offload;
+ struct rcu_head rcu;
+ struct udp_offload_priv __rcu *next;
+};
+
static int udp4_ufo_send_check(struct sk_buff *skb)
{
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -41,6 +52,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
+ int offset;
+ __wsum csum;
+
+ if (skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+ segs = skb_udp_tunnel_segment(skb, features);
+ goto out;
+ }
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
@@ -52,7 +72,10 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
SKB_GSO_UDP_TUNNEL |
- SKB_GSO_GRE | SKB_GSO_MPLS) ||
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
+ SKB_GSO_MPLS) ||
!(type & (SKB_GSO_UDP))))
goto out;
@@ -62,35 +85,162 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+ offset = skb_checksum_start_offset(skb);
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ offset += skb->csum_offset;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
/* Fragment the skb. IP headers of the fragments are updated in
* inet_gso_segment()
*/
- if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
- segs = skb_udp_tunnel_segment(skb, features);
- else {
- int offset;
- __wsum csum;
-
- /* Do software UFO. Complete and fill in the UDP checksum as
- * HW cannot do checksum of UDP packets sent as multiple
- * IP fragments.
- */
- offset = skb_checksum_start_offset(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
- skb->ip_summed = CHECKSUM_NONE;
-
- segs = skb_segment(skb, features);
- }
+ segs = skb_segment(skb, features);
out:
return segs;
}
+int udp_add_offload(struct udp_offload *uo)
+{
+ struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
+
+ if (!new_offload)
+ return -ENOMEM;
+
+ new_offload->offload = uo;
+
+ spin_lock(&udp_offload_lock);
+ new_offload->next = udp_offload_base;
+ rcu_assign_pointer(udp_offload_base, new_offload);
+ spin_unlock(&udp_offload_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(udp_add_offload);
+
+static void udp_offload_free_routine(struct rcu_head *head)
+{
+ struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
+ kfree(ou_priv);
+}
+
+void udp_del_offload(struct udp_offload *uo)
+{
+ struct udp_offload_priv __rcu **head = &udp_offload_base;
+ struct udp_offload_priv *uo_priv;
+
+ spin_lock(&udp_offload_lock);
+
+ uo_priv = udp_deref_protected(*head);
+ for (; uo_priv != NULL;
+ uo_priv = udp_deref_protected(*head)) {
+ if (uo_priv->offload == uo) {
+ rcu_assign_pointer(*head,
+ udp_deref_protected(uo_priv->next));
+ goto unlock;
+ }
+ head = &uo_priv->next;
+ }
+ pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
+unlock:
+ spin_unlock(&udp_offload_lock);
+ if (uo_priv != NULL)
+ call_rcu(&uo_priv->rcu, udp_offload_free_routine);
+}
+EXPORT_SYMBOL(udp_del_offload);
+
+static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct udp_offload_priv *uo_priv;
+ struct sk_buff *p, **pp = NULL;
+ struct udphdr *uh, *uh2;
+ unsigned int hlen, off;
+ int flush = 1;
+
+ if (NAPI_GRO_CB(skb)->udp_mark ||
+ (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE))
+ goto out;
+
+ /* mark that this skb passed once through the udp gro layer */
+ NAPI_GRO_CB(skb)->udp_mark = 1;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*uh);
+ uh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ uh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!uh))
+ goto out;
+ }
+
+ rcu_read_lock();
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_receive)
+ goto unflush;
+ }
+ goto out_unlock;
+
+unflush:
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = (struct udphdr *)(p->data + off);
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+ pp = uo_priv->offload->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+ return pp;
+}
+
+static int udp_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct udp_offload_priv *uo_priv;
+ __be16 newlen = htons(skb->len - nhoff);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ int err = -ENOSYS;
+
+ uh->len = newlen;
+
+ rcu_read_lock();
+
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_complete)
+ break;
+ }
+
+ if (uo_priv != NULL)
+ err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr));
+
+ rcu_read_unlock();
+ return err;
+}
+
static const struct net_offload udpv4_offload = {
.callbacks = {
.gso_send_check = udp4_ufo_send_check,
.gso_segment = udp4_ufo_fragment,
+ .gro_receive = udp_gro_receive,
+ .gro_complete = udp_gro_complete,
},
};
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2c46acd4cc3..3b3efbda48e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = {
.protocol = IPPROTO_UDPLITE,
.prot = &udplite_prot,
.ops = &inet_dgram_ops,
- .no_check = 0, /* must checksum (RFC 3828) */
.flags = INET_PROTOSW_PERMANENT,
};
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 1f12c8b4586..aac6197b7a7 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -37,15 +37,6 @@ drop:
return NET_RX_DROP;
}
-int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
- int encap_type)
-{
- XFRM_SPI_SKB_CB(skb)->family = AF_INET;
- XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
- return xfrm_input(skb, nexthdr, spi, encap_type);
-}
-EXPORT_SYMBOL(xfrm4_rcv_encap);
-
int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
struct iphdr *iph = ip_hdr(skb);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e3db3f91511..71acd0014f2 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
skb_set_network_header(skb, -x->props.header_len -
- hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
+ hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
if (x->sel.family != AF_INET6)
skb->network_header += IPV4_BEET_PHMAXLEN;
skb->mac_header = skb->network_header +
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index eb1dd4d643f..91771a7c802 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,65 +15,6 @@
#include <net/ip.h>
#include <net/xfrm.h>
-/* Informational hook. The decap is still done here. */
-static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
-static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
-
-int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
-{
- struct xfrm_tunnel __rcu **pprev;
- struct xfrm_tunnel *t;
- int ret = -EEXIST;
- int priority = handler->priority;
-
- mutex_lock(&xfrm4_mode_tunnel_input_mutex);
-
- for (pprev = &rcv_notify_handlers;
- (t = rcu_dereference_protected(*pprev,
- lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
- pprev = &t->next) {
- if (t->priority > priority)
- break;
- if (t->priority == priority)
- goto err;
-
- }
-
- handler->next = *pprev;
- rcu_assign_pointer(*pprev, handler);
-
- ret = 0;
-
-err:
- mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
-
-int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
-{
- struct xfrm_tunnel __rcu **pprev;
- struct xfrm_tunnel *t;
- int ret = -ENOENT;
-
- mutex_lock(&xfrm4_mode_tunnel_input_mutex);
- for (pprev = &rcv_notify_handlers;
- (t = rcu_dereference_protected(*pprev,
- lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
- pprev = &t->next) {
- if (t == handler) {
- *pprev = handler->next;
- ret = 0;
- break;
- }
- }
- mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
- synchronize_net();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
-
static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
{
struct iphdr *inner_iph = ipip_hdr(skb);
@@ -117,24 +58,18 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
- ip_select_ident(top_iph, dst->child, NULL);
top_iph->ttl = ip4_dst_hoplimit(dst->child);
top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;
+ ip_select_ident(skb, NULL);
return 0;
}
-#define for_each_input_rcu(head, handler) \
- for (handler = rcu_dereference(head); \
- handler != NULL; \
- handler = rcu_dereference(handler->next))
-
static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct xfrm_tunnel *handler;
int err = -EINVAL;
if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto out;
- for_each_input_rcu(rcv_notify_handlers, handler)
- handler->handler(skb);
-
err = skb_unclone(skb, GFP_ATOMIC);
if (err)
goto out;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index baa0f63731f..d5f6bd9a210 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
goto out;
- if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
goto out;
mtu = dst_mtu(skb_dst(skb));
@@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
if (err)
return err;
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED;
-
- skb->protocol = htons(ETH_P_IP);
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
return x->outer_mode->output2(x, skb);
}
@@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output);
int xfrm4_output_finish(struct sk_buff *skb)
{
-#ifdef CONFIG_NETFILTER
- if (!skb_dst(skb)->xfrm) {
- IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(skb);
- }
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ skb->protocol = htons(ETH_P_IP);
+#ifdef CONFIG_NETFILTER
IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
#endif
- skb->protocol = htons(ETH_P_IP);
return xfrm_output(skb);
}
-int xfrm4_output(struct sk_buff *skb)
+static int __xfrm4_output(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
- struct xfrm_state *x = dst->xfrm;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+#ifdef CONFIG_NETFILTER
+ if (!x) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output(skb);
+ }
+#endif
+ return x->outer_mode->afinfo->output_finish(skb);
+}
+
+int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
- NULL, dst->dev,
- x->outer_mode->afinfo->output_finish,
+ NULL, skb_dst(skb)->dev, __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9a459be24af..6156f68a1e9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -104,9 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
const struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
@@ -235,7 +240,7 @@ static struct dst_ops xfrm4_dst_ops = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 1024,
+ .gc_thresh = 32768,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -320,6 +325,7 @@ void __init xfrm4_init(void)
xfrm4_state_init();
xfrm4_policy_init();
+ xfrm4_protocol_init();
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm4_net_ops);
#endif
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 00000000000..a2ce0101eaa
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,301 @@
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_handlers;
+ case IPPROTO_AH:
+ return &ah4_handlers;
+ case IPPROTO_COMP:
+ return &ipcomp4_handlers;
+ }
+
+ return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
+
+ if (!head)
+ return 0;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->cb_handler(skb, err)) <= 0)
+ return ret;
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_cb);
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ if (!head)
+ goto out;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+ return ret;
+
+out:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static const struct net_protocol esp4_protocol = {
+ .handler = xfrm4_esp_rcv,
+ .err_handler = xfrm4_esp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol ah4_protocol = {
+ .handler = xfrm4_ah_rcv,
+ .err_handler = xfrm4_ah_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_ipcomp_rcv,
+ .err_handler = xfrm4_ipcomp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+ .family = AF_INET,
+ .owner = THIS_MODULE,
+ .callback = xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_protocol;
+ case IPPROTO_AH:
+ return &ah4_protocol;
+ case IPPROTO_COMP:
+ return &ipcomp4_protocol;
+ }
+
+ return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ bool add_netproto = false;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex)))
+ add_netproto = true;
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority < priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ if (add_netproto) {
+ if (inet_add_protocol(netproto(protocol), protocol)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ int ret = -ENOENT;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex))) {
+ if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+ pr_err("%s: can't remove protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+ xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 0b2a0641526..542074c00c7 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -16,7 +16,7 @@
static int xfrm4_init_flags(struct xfrm_state *x)
{
- if (ipv4_config.no_pmtu_disc)
+ if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
x->props.flags |= XFRM_STATE_NOPMTUDISC;
return 0;
}