diff options
Diffstat (limited to 'net/ipv4')
94 files changed, 3204 insertions, 2288 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4b81e91c80f..f032688d20d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o ping.o ip_tunnel_core.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o @@ -19,7 +19,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o -gre-y := gre_demux.o gre_offload.o +gre-y := gre_demux.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o @@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ - xfrm4_output.o + xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70011e029ac..d156b3c5f36 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,9 +126,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); - /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -257,7 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; @@ -315,7 +311,6 @@ lookup_protocol: sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -327,7 +322,6 @@ lookup_protocol: goto out; err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; @@ -342,7 +336,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -1005,7 +999,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, @@ -1015,7 +1008,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -1024,7 +1016,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }, @@ -1033,7 +1024,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } }; @@ -1133,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, false); + inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -1264,10 +1254,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | 0))) goto out; @@ -1299,8 +1291,11 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - /* Note : following gso_segment() might change skb->encapsulation */ - udpfrag = !skb->encapsulation && proto == IPPROTO_UDP; + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + udpfrag = proto == IPPROTO_UDP && encap; + else + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) @@ -1377,8 +1372,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ip_hdr(p); - + iph2 = (struct iphdr *)(p->data + off); + /* The above works because, with the exception of the top + * (inner most) layer, we only aggregate pkts with the same + * hdr length so all the hdrs we'll need to verify will start + * at the same offset. + */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { @@ -1390,13 +1389,24 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | - (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | - ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); + ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); + /* Save the IP ID check to be included later when we get to + * the transport layer so only the inner most IP ID is checked. + * This is because some GSO/TSO implementations do not + * correctly increment the IP ID for the outer hdrs. + */ + NAPI_GRO_CB(p)->flush_id = + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } NAPI_GRO_CB(skb)->flush |= flush; + skb_set_network_header(skb, off); + /* The above will be needed by the transport layer if there is one + * immediately following this IP hdr. + */ + skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -1411,14 +1421,17 @@ out: return pp; } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - skb_network_offset(skb)); - struct iphdr *iph = ip_hdr(skb); + __be16 newlen = htons(skb->len - nhoff); + struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; int proto = iph->protocol; int err = -ENOSYS; + if (skb->encapsulation) + skb_set_inner_network_header(skb, nhoff); + csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; @@ -1427,7 +1440,11 @@ static int inet_gro_complete(struct sk_buff *skb) if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb); + /* Only need to add sizeof(*iph) to get to the next hdr below + * because any hdr with option will have been flushed in + * inet_gro_receive(). + */ + err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); @@ -1457,22 +1474,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; - int i, j; + int i; - for_each_possible_cpu(i) { - for (j = 0; j < SNMP_ARRAY_SZ; j++) - res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); - } + for_each_possible_cpu(i) + res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; @@ -1483,12 +1498,12 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) u64 v; unsigned int start; - bhptr = per_cpu_ptr(mib[0], cpu); + bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { - start = u64_stats_fetch_begin_bh(syncp); + start = u64_stats_fetch_begin_irq(syncp); v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_bh(syncp, start)); + } while (u64_stats_fetch_retry_irq(syncp, start)); res += v; } @@ -1497,25 +1512,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ - BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, align); - if (!ptr[0]) - return -ENOMEM; - -#if SNMP_ARRAY_SZ == 2 - ptr[1] = __alloc_percpu(mibsize, align); - if (!ptr[1]) { - free_percpu(ptr[0]); - ptr[0] = NULL; - return -ENOMEM; - } -#endif - return 0; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1529,6 +1525,7 @@ static const struct net_protocol tcp_protocol = { .err_handler = tcp_v4_err, .no_policy = 1, .netns_ok = 1, + .icmp_strict_tag_validation = 1, }; static const struct net_protocol udp_protocol = { @@ -1550,40 +1547,30 @@ static __net_init int ipv4_mib_init_net(struct net *net) { int i; - if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) + net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); + if (!net->mib.tcp_statistics) goto err_tcp_mib; - if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i); - u64_stats_init(&af_inet_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i); + af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); -#endif } - if (snmp_mib_init((void __percpu **)net->mib.net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) + net->mib.net_statistics = alloc_percpu(struct linux_mib); + if (!net->mib.net_statistics) goto err_net_mib; - if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udp_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udp_statistics) goto err_udp_mib; - if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udplite_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_statistics) goto err_udplite_mib; - if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) + net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); + if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); @@ -1594,17 +1581,17 @@ static __net_init int ipv4_mib_init_net(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + free_percpu(net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + free_percpu(net->mib.udplite_statistics); err_udplite_mib: - snmp_mib_free((void __percpu **)net->mib.udp_statistics); + free_percpu(net->mib.udp_statistics); err_udp_mib: - snmp_mib_free((void __percpu **)net->mib.net_statistics); + free_percpu(net->mib.net_statistics); err_net_mib: - snmp_mib_free((void __percpu **)net->mib.ip_statistics); + free_percpu(net->mib.ip_statistics); err_ip_mib: - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } @@ -1612,12 +1599,12 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); - snmp_mib_free((void __percpu **)net->mib.udp_statistics); - snmp_mib_free((void __percpu **)net->mib.net_statistics); - snmp_mib_free((void __percpu **)net->mib.ip_statistics); - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.icmp_statistics); + free_percpu(net->mib.udplite_statistics); + free_percpu(net->mib.udp_statistics); + free_percpu(net->mib.net_statistics); + free_percpu(net->mib.ip_statistics); + free_percpu(net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1630,6 +1617,39 @@ static int __init init_ipv4_mibs(void) return register_pernet_subsys(&ipv4_mib_ops); } +static __net_init int inet_init_net(struct net *net) +{ + /* + * Set defaults for local port range + */ + seqlock_init(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = 32768; + net->ipv4.ip_local_ports.range[1] = 61000; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. + */ + net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); + net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); + return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { + .init = inet_init_net, + .exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ + return register_pernet_subsys(&af_inet_ops); +} + static int ipv4_proc_init(void); /* @@ -1683,13 +1703,9 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); - sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); - if (!sysctl_local_reserved_ports) - goto out; - rc = proto_register(&tcp_prot, 1); if (rc) - goto out_free_reserved_ports; + goto out; rc = proto_register(&udp_prot, 1); if (rc) @@ -1774,6 +1790,9 @@ static int __init inet_init(void) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif + + if (init_inet_pernet_ops()) + pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); /* * Initialise per-cpu ipv4 mibs */ @@ -1796,8 +1815,6 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); -out_free_reserved_ports: - kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 717902669d2..a2afa89513a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; @@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } err = -ENOMEM; - iph = ah_alloc_tmp(ahash, nfrags, ihl); + iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; - - icv = ah_tmp_icv(ahash, iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; @@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; @@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) iph = ip_hdr(skb); ihl = ip_hdrlen(skb); - work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + + ahp->icv_trunc_len + seqhi_len); if (!work_iph) goto out; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; @@ -397,7 +428,7 @@ out: return err; } -static void ah4_err(struct sk_buff *skb, u32 info) +static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); + + return 0; } static int ah_init_state(struct xfrm_state *x) @@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x) kfree(ahp); } +static int ah4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} static const struct xfrm_type ah_type = { @@ -518,11 +555,12 @@ static const struct xfrm_type ah_type = .output = ah_output }; -static const struct net_protocol ah4_protocol = { +static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init ah4_init(void) @@ -531,7 +569,7 @@ static int __init ah4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; @@ -541,7 +579,7 @@ static int __init ah4_init(void) static void __exit ah4_fini(void) { - if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f0418586394..1a9b99e0446 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -381,6 +381,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { + struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { @@ -399,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; + in_dev = NULL; break; case 4: /* Reserved */ case 5: @@ -410,7 +412,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) default: return 0; } - return !inet_confirm_addr(in_dev, sip, tip, scope); + return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) @@ -730,6 +732,7 @@ static int arp_process(struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. @@ -896,10 +899,12 @@ static int arp_process(struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ + is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && + inet_addr_type(net, sip) == RTN_UNICAST; + if (n == NULL && - (arp->ar_op == htons(ARPOP_REPLY) || - (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && - inet_addr_type(net, sip) == RTN_UNICAST) + ((arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -912,8 +917,10 @@ static int arp_process(struct sk_buff *skb) agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ - override = time_after(jiffies, n->updated + - NEIGH_VAR(n->parms, LOCKTIME)); + override = time_after(jiffies, + n->updated + + NEIGH_VAR(n->parms, LOCKTIME)) || + is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. @@ -1110,7 +1117,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -int arp_invalidate(struct net_device *dev, __be32 ip) +static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; @@ -1125,7 +1132,6 @@ int arp_invalidate(struct net_device *dev, __be32 ip) return err; } -EXPORT_SYMBOL(arp_invalidate); static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4b59b6e488d..69e77c8ff28 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1335,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1431,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1526,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 19e36376d2a..a3095fdefbe 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -53,7 +53,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, true); + inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) @@ -86,18 +86,26 @@ out: } EXPORT_SYMBOL(ip4_datagram_connect); +/* Because UDP xmit path can manipulate sk_dst_cache without holding + * socket lock, we need to use sk_dst_set() here, + * even if we own the socket lock. + */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; + struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; - if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0)) - return; - rcu_read_lock(); + + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { + rcu_read_unlock(); + return; + } inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; @@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk) inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); - if (!IS_ERR(rt)) - __sk_dst_set(sk, &rt->dst); + + dst = !IS_ERR(rt) ? &rt->dst : NULL; + sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 43065be3630..e9449376b58 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -99,13 +99,13 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, + [IFA_FLAGS] = { .type = NLA_U32 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static DEFINE_SPINLOCK(inet_addr_hash_lock); static u32 inet_addr_hash(struct net *net, __be32 addr) { @@ -118,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); - spin_unlock(&inet_addr_hash_lock); } static void inet_hash_remove(struct in_ifaddr *ifa) { - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_del_init_rcu(&ifa->hash); - spin_unlock(&inet_addr_hash_lock); } /** @@ -463,7 +461,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { - net_srandom(ifa->ifa_local); + prandom_seed((__force u32) ifa->ifa_local); ifap = last_primary; } @@ -473,7 +471,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&check_lifetime_work); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -683,7 +681,8 @@ static void check_lifetime(struct work_struct *work) if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; - schedule_delayed_work(&check_lifetime_work, next_sched - now); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, + next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, @@ -757,7 +756,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); - ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : + ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; @@ -827,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa_existing = find_matching_ifa(ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but - * userspace alreay relies on not having to provide this. + * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); @@ -840,7 +840,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa = ifa_existing; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&check_lifetime_work); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, + &check_lifetime_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } @@ -1238,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, /* * Confirm that local IP address exists using wildcards: - * - in_dev: only on this interface, 0=any interface + * - net: netns to check, cannot be NULL + * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ -__be32 inet_confirm_addr(struct in_device *in_dev, +__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; - struct net *net; - if (scope != RT_SCOPE_LINK) + if (in_dev != NULL) return confirm_addr_indev(in_dev, dst, local, scope); - net = dev_net(in_dev->dev); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); @@ -1384,6 +1384,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } @@ -1437,7 +1439,9 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ + + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) @@ -1505,6 +1509,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, preferred, valid)) goto nla_put_failure; @@ -1693,6 +1698,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (type == -1 || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_PROXY_NEIGH) + size += nla_total_size(4); return size; } @@ -1729,6 +1736,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, + IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) + goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -1766,6 +1777,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -1947,6 +1959,19 @@ static void inet_forward_change(struct net *net) } } +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ + if (cnf == net->ipv4.devconf_dflt) + return NETCONFA_IFINDEX_DEFAULT; + else if (cnf == net->ipv4.devconf_all) + return NETCONFA_IFINDEX_ALL; + else { + struct in_device *idev + = container_of(cnf, struct in_device, cnf); + return idev->dev->ifindex; + } +} + static int devinet_conf_proc(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1959,6 +1984,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; + int ifindex; set_bit(i, cnf->state); @@ -1968,23 +1994,19 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { - int ifindex; - - if (cnf == net->ipv4.devconf_dflt) - ifindex = NETCONFA_IFINDEX_DEFAULT; - else if (cnf == net->ipv4.devconf_all) - ifindex = NETCONFA_IFINDEX_ALL; - else { - struct in_device *idev = - container_of(cnf, struct in_device, - cnf); - ifindex = idev->dev->ifindex; - } + ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, ifindex, cnf); } + if (i == IPV4_DEVCONF_PROXY_ARP - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + ifindex, cnf); + } } return ret; @@ -2300,7 +2322,7 @@ void __init devinet_init(void) register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); rtnl_af_register(&inet_af_ops); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 7785b28061a..360b565918c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -473,7 +473,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp4_err(struct sk_buff *skb, u32 info) +static int esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -483,23 +483,25 @@ static void esp4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); + + return 0; } static void esp_destroy(struct xfrm_state *x) @@ -672,6 +674,11 @@ error: return err; } +static int esp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp_type = { .description = "ESP4", @@ -685,11 +692,12 @@ static const struct xfrm_type esp_type = .output = esp_output }; -static const struct net_protocol esp4_protocol = { +static struct xfrm4_protocol esp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = esp4_rcv_cb, .err_handler = esp4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init esp4_init(void) @@ -698,7 +706,7 @@ static int __init esp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp_type, AF_INET); return -EAGAIN; @@ -708,7 +716,7 @@ static int __init esp4_init(void) static void __exit esp4_fini(void) { - if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d846304b7b8..255aa9946fe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -250,7 +250,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif; + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) - return ip_rt_dump(skb, cb); + return skb->len; s_h = cb->args[0]; s_e = cb->args[1]; @@ -1047,6 +1047,8 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo } in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + return NOTIFY_DONE; switch (event) { case NETDEV_UP: diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 388d113fd28..1e4f6600b31 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -33,8 +33,6 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 523be38e37d..f2e15738534 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -104,7 +104,10 @@ errout: static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; - struct net_device *dev = result->fi->fib_dev; + struct net_device *dev = NULL; + + if (result->fi) + dev = result->fi->fib_dev; /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e63f47a4e65..b10cd43a472 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -426,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt) +static int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, + int dflt) { struct neighbour *n; int state = NUD_NONE; @@ -630,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, .flowi4_oif = nh->nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ @@ -819,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); if (!fi->fib_metrics) goto failure; } else fi->fib_metrics = (u32 *) dst_default_metrics; - fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5893e99e829..0485bf7f8f0 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_push(skb, hdr_len); + skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = tnl_flags_to_gre_flags(tpi->flags); greh->protocol = tpi->proto; @@ -84,7 +85,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, ptr--; } if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { *ptr = 0; *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, skb->len, 0)); @@ -93,28 +95,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, } EXPORT_SYMBOL_GPL(gre_build_header); -static __sum16 check_checksum(struct sk_buff *skb) -{ - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - - if (!csum) - break; - /* Fall through. */ - - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - break; - } - - return csum; -} - static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { @@ -141,7 +121,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { - if (check_checksum(skb)) { + if (skb_checksum_simple_validate(skb)) { *csum_err = true; return -EINVAL; } @@ -182,6 +162,14 @@ static int gre_cisco_rcv(struct sk_buff *skb) int i; bool csum_err = false; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! */ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + if (parse_gre_header(skb, &tpi, &csum_err) < 0) goto drop; @@ -355,14 +343,7 @@ static int __init gre_init(void) goto err_gre; } - if (gre_offload_init()) { - pr_err("can't add protocol offload\n"); - goto err_gso; - } - return 0; -err_gso: - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); err_gre: inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); err: @@ -371,8 +352,6 @@ err: static void __exit gre_exit(void) { - gre_offload_exit(); - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e5d43618846..f0bdd47bbbc 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -11,6 +11,7 @@ */ #include <linux/skbuff.h> +#include <linux/init.h> #include <net/protocol.h> #include <net/gre.h> @@ -26,8 +27,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t enc_features; - int ghl = GRE_HEADER_SECTION; + int ghl; struct gre_base_hdr *greh; + u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; __be16 protocol = skb->protocol; int tnl_hlen; @@ -40,6 +42,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP))) goto out; @@ -48,23 +51,21 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (greh->flags & GRE_KEY) - ghl += GRE_HEADER_SECTION; - if (greh->flags & GRE_SEQ) - ghl += GRE_HEADER_SECTION; - if (greh->flags & GRE_CSUM) { - ghl += GRE_HEADER_SECTION; - csum = true; - } else - csum = false; + ghl = skb_inner_network_header(skb) - skb_transport_header(skb); + if (unlikely(ghl < sizeof(*greh))) + goto out; - /* setup inner skb. */ - skb->protocol = greh->protocol; - skb->encapsulation = 0; + csum = !!(greh->flags & GRE_CSUM); + if (csum) + skb->encap_hdr_csum = 1; if (unlikely(!pskb_may_pull(skb, ghl))) goto out; + /* setup inner skb. */ + skb->protocol = greh->protocol; + skb->encapsulation = 0; + __skb_pull(skb, ghl); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); @@ -73,8 +74,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); goto out; + } skb = segs; tnl_hlen = skb_tnl_header_len(skb); @@ -94,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } } - greh = (struct gre_base_hdr *)(skb->data); + skb_reset_transport_header(skb); + + greh = (struct gre_base_hdr *) + skb_transport_header(skb); pcsum = (__be32 *)(greh + 1); *pcsum = 0; - *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } __skb_push(skb, tnl_hlen - ghl); @@ -113,19 +119,180 @@ out: return segs; } +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. + */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ + __sum16 sum; + + skb->csum = skb_checksum(skb, 0, skb->len, 0); + NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, + csum_partial(skb->data, skb_gro_offset(skb), 0)); + sum = csum_fold(NAPI_GRO_CB(skb)->csum); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { + if (unlikely(!sum) && !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } else { + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + } + + return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + const struct gre_base_hdr *greh; + unsigned int hlen, grehlen; + unsigned int off; + int flush = 1; + struct packet_offload *ptype; + __be16 type; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*greh); + greh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out; + } + + /* Only support version 0 and K (key), C (csum) flags. Note that + * although the support for the S (seq#) flag can be added easily + * for GRO, this is problematic for GSO hence can not be enabled + * here because a GRO pkt may end up in the forwarding path, thus + * requiring GSO support to break it up correctly. + */ + if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) + goto out; + + type = greh->protocol; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) + goto out_unlock; + + grehlen = GRE_HEADER_SECTION; + + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + hlen = off + grehlen; + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out_unlock; + } + if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ + __sum16 csum = 0; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum = csum_fold(NAPI_GRO_CB(skb)->csum); + /* Don't trust csum error calculated/reported by h/w */ + if (skb->ip_summed == CHECKSUM_NONE || csum != 0) + csum = gro_skb_checksum(skb); + + /* GRE CSUM is the 1's complement of the 1's complement sum + * of the GRE hdr plus payload so it should add up to 0xffff + * (and 0 after csum_fold()) just like the IPv4 hdr csum. + */ + if (csum) + goto out_unlock; + } + flush = 0; + + for (p = *head; p; p = p->next) { + const struct gre_base_hdr *greh2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + /* The following checks are needed to ensure only pkts + * from the same tunnel are considered for aggregation. + * The criteria for "the same tunnel" includes: + * 1) same version (we only support version 0 here) + * 2) same protocol (we only support ETH_P_IP for now) + * 3) same set of flags + * 4) same key if the key field is present. + */ + greh2 = (struct gre_base_hdr *)(p->data + off); + + if (greh2->flags != greh->flags || + greh2->protocol != greh->protocol) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + if (greh->flags & GRE_KEY) { + /* compare keys */ + if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + } + + skb_gro_pull(skb, grehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, greh, grehlen); + + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); + struct packet_offload *ptype; + unsigned int grehlen = sizeof(*greh); + int err = -ENOENT; + __be16 type; + + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type = SKB_GSO_GRE; + + type = greh->protocol; + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + + rcu_read_unlock(); + return err; +} + static const struct net_offload gre_offload = { .callbacks = { .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, + .gro_receive = gre_gro_receive, + .gro_complete = gre_gro_complete, }, }; -int __init gre_offload_init(void) +static int __init gre_offload_init(void) { return inet_add_offload(&gre_offload, IPPROTO_GRE); } - -void __exit gre_offload_exit(void) -{ - inet_del_offload(&gre_offload, IPPROTO_GRE); -} +device_initcall(gre_offload_init); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5c0e8bc6e5b..42b7bcf8045 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct sock *sk; struct inet_sock *inet; __be32 daddr, saddr; + u32 mark = IP4_REPLY_MARK(net, skb->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; @@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; + sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; @@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = saddr; + fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); @@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, const struct iphdr *iph, - __be32 saddr, u8 tos, + __be32 saddr, u8 tos, u32 mark, int type, int code, struct icmp_bxm *param) { @@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->daddr = (param->replyopts.opt.opt.srr ? param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; + fl4->flowi4_mark = mark; fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; @@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct flowi4 fl4; __be32 saddr; u8 tos; + u32 mark; struct net *net; struct sock *sk; @@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) goto out_unlock; @@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param->skb = skb_in; icmp_param->offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; + sk->sk_mark = mark; ipc.addr = iph->saddr; ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -668,6 +675,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) rcu_read_unlock(); } +static bool icmp_tag_validation(int proto) +{ + bool ok; + + rcu_read_lock(); + ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; + rcu_read_unlock(); + return ok; +} + /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. @@ -705,13 +722,23 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { + /* for documentation of the ip_no_pmtu_disc + * values please see + * Documentation/networking/ip-sysctl.txt + */ + switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + default: LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); - } else { - info = ntohs(icmph->un.frag.mtu); - if (!info) + break; + case 2: + goto out; + case 3: + if (!icmp_tag_validation(iph->protocol)) goto out; + /* fall through */ + case 0: + info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: @@ -886,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb) ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto csum_error; - } + if (skb_checksum_simple_validate(skb)) + goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7defdc9ba16..db710b059ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -211,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = net_random() % max_delay; + int tv = prandom_u32() % max_delay; im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -220,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = net_random() % in_dev->mr_maxdelay; + int tv = prandom_u32() % in_dev->mr_maxdelay; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) @@ -229,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = net_random() % delay; + int tv = prandom_u32() % delay; if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); @@ -310,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) struct ip_sf_list *psf; int scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -463,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, } first = 1; psf_prev = NULL; - for (psf=*psf_list; psf; psf=psf_next) { + for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; @@ -520,7 +520,7 @@ empty_source: return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ - if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) { + if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } @@ -576,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf=*ppsf; psf; psf = psf_next) { + for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) @@ -600,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; @@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; @@ -764,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev) static void igmp_timer_expire(unsigned long data) { - struct ip_mc_list *im=(struct ip_mc_list *)data; + struct ip_mc_list *im = (struct ip_mc_list *)data; struct in_device *in_dev = im->interface; spin_lock(&im->lock); @@ -794,10 +794,10 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) int i, scount; scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) { + for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != @@ -825,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) /* mark INCLUDE-mode sources */ scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) + for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; @@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto drop; - } + if (skb_checksum_simple_validate(skb)) + goto drop; ih = igmp_hdr(skb); switch (ih->type) { @@ -1103,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); @@ -1121,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; @@ -1134,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) } spin_unlock_bh(&in_dev->mc_tomb_lock); if (pmc) { - for (psf=pmc->tomb; psf; psf=psf_next) { + for (psf = pmc->tomb; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1167,7 +1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); - for (; psf; psf=psf_next) { + for (; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1557,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1630,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[sfmode]--; } err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; @@ -1650,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { @@ -1671,7 +1663,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1699,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc) struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -1716,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc) int new_in, rv; rv = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -1726,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; @@ -1748,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { @@ -1800,7 +1792,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]++; err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; @@ -1810,7 +1802,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]--; - for (j=0; j<i; j++) + for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST @@ -1829,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { @@ -1844,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *nextpsf; - for (psf=pmc->tomb; psf; psf=nextpsf) { + for (psf = pmc->tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } pmc->tomb = NULL; - for (psf=pmc->sources; psf; psf=nextpsf) { + for (psf = pmc->sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } @@ -1952,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_lock(); in_dev = ip_mc_find_dev(net, imr); + if (!in_dev) { + ret = -ENODEV; + goto out; + } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; @@ -1969,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - if (in_dev) - ip_mc_dec_group(in_dev, group); + ip_mc_dec_group(in_dev, group); rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } - if (!in_dev) - ret = -ENODEV; +out: rtnl_unlock(); return ret; } @@ -2043,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2062,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); - for (j=i+1; j<psl->sl_count; j++) + for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; @@ -2088,7 +2082,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { - for (i=0; i<psl->sl_count; i++) + for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); @@ -2098,7 +2092,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2106,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } if (rv == 0) /* address already there is an error */ goto done; - for (j=psl->sl_count-1; j>=i; j--) + for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; @@ -2305,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } - for (i=0; i<copycount; i++) { + for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; @@ -2350,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) if (!psl) goto unlock; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } @@ -2423,7 +2417,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { - for (psf=im->sources; psf; psf=psf->sf_next) { + for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } @@ -2762,6 +2756,7 @@ static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; +#endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -2785,8 +2780,9 @@ static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; -int __init igmp_mc_proc_init(void) +int __init igmp_mc_init(void) { +#if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); @@ -2800,5 +2796,7 @@ int __init igmp_mc_proc_init(void) reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; -} +#else + return register_netdevice_notifier(&igmp_notifier); #endif +} diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc0e649cc00..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,19 +29,16 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - *low = net->ipv4.sysctl_local_ports.range[0]; - *high = net->ipv4.sysctl_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + *low = net->ipv4.ip_local_ports.range[0]; + *high = net->ipv4.ip_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); @@ -109,11 +106,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - smallest_rover = rover = net_random() % remaining + low; + smallest_rover = rover = prandom_u32() % remaining + low; smallest_size = -1; do { - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; @@ -408,7 +405,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct net *net = sock_net(sk); int flags = inet_sk_flowi_flags(sk); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, flags, @@ -445,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rcu_read_lock(); opt = rcu_dereference(newinet->inet_opt); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, @@ -680,6 +677,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + newsk->sk_mark = inet_rsk(req)->ir_mark; + newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 56a964a553d..e34dccbc4d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -106,6 +106,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = inet->inet_dport; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = inet->inet_rcv_saddr; r->id.idiag_dst[0] = inet->inet_daddr; @@ -240,12 +244,19 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->idiag_family = tw->tw_family; r->idiag_retrans = 0; + r->id.idiag_if = tw->tw_bound_dev_if; sock_diag_save_cookie(tw, r->id.idiag_cookie); + r->id.idiag_sport = tw->tw_sport; r->id.idiag_dport = tw->tw_dport; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = tw->tw_rcv_saddr; r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; r->idiag_timer = 3; r->idiag_expires = jiffies_to_msecs(tmo); @@ -726,8 +737,13 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = ireq->ir_rmt_port; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = ireq->ir_loc_addr; r->id.idiag_dst[0] = ireq->ir_rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; @@ -914,12 +930,15 @@ skip_listen_ht: spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { int res; + int state; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next_normal; - if (!(r->idiag_states & (1 << sk->sk_state))) + state = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_substate : sk->sk_state; + if (!(r->idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bb075fc9a14..3b01959bf4b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -208,7 +208,7 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) } work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0) { + while (work > 0 || force) { spin_lock(&nf->lru_lock); if (list_empty(&nf->lru_list)) { @@ -278,9 +278,10 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &hb->chain); + inet_frag_lru_add(nf, qp); spin_unlock(&hb->chain_lock); read_unlock(&f->lock); - inet_frag_lru_add(nf, qp); + return qp; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf279450..43116e8c8e1 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -274,7 +274,7 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif) { - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; @@ -327,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, @@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - if (inet_is_reserved_local_port(port)) + if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 1975f52933c..f17ea49b28f 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -230,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, lro_desc->last_skb = skb; } -static void lro_add_frags(struct net_lro_desc *lro_desc, - int len, int hlen, int truesize, - struct skb_frag_struct *skb_frags, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *skb = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb->truesize += truesize; - - skb_frags[0].page_offset += hlen; - skb_frag_size_sub(&skb_frags[0], hlen); - - while (tcp_data_len > 0) { - *(lro_desc->next_frag) = *skb_frags; - tcp_data_len -= skb_frag_size(skb_frags); - lro_desc->next_frag++; - skb_frags++; - skb_shinfo(skb)->nr_frags++; - } -} static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, @@ -371,128 +348,6 @@ out: return 1; } - -static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *mac_hdr, - int hlen, __wsum sum, - u32 ip_summed) -{ - struct sk_buff *skb; - struct skb_frag_struct *skb_frags; - int data_len = len; - int hdr_len = min(len, hlen); - - skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); - if (!skb) - return NULL; - - skb_reserve(skb, lro_mgr->frag_align_pad); - skb->len = len; - skb->data_len = len - hdr_len; - skb->truesize += true_size; - skb->tail += hdr_len; - - memcpy(skb->data, mac_hdr, hdr_len); - - skb_frags = skb_shinfo(skb)->frags; - while (data_len > 0) { - *skb_frags = *frags; - data_len -= skb_frag_size(frags); - skb_frags++; - frags++; - skb_shinfo(skb)->nr_frags++; - } - - skb_shinfo(skb)->frags[0].page_offset += hdr_len; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); - - skb->ip_summed = ip_summed; - skb->csum = sum; - skb->protocol = eth_type_trans(skb, lro_mgr->dev); - return skb; -} - -static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *priv, __wsum sum) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - struct sk_buff *skb; - u64 flags; - void *mac_hdr; - int mac_hdr_len; - int hdr_len = LRO_MAX_PG_HLEN; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_frag_header || - lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { - mac_hdr = skb_frag_address(frags); - goto out1; - } - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out1; - - hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); - mac_hdr_len = (int)((void *)(iph) - mac_hdr); - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out1; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) - goto out1; - - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, 0, lro_mgr->ip_summed_aggr); - if (!skb) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - iph = (void *)(skb->data + vlan_hdr_len); - tcph = (void *)((u8 *)skb->data + vlan_hdr_len - + IP_HDR_LEN(iph)); - - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return NULL; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) - goto out2; - - lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return NULL; - -out2: /* send aggregated packets to the stack */ - lro_flush(lro_mgr, lro_desc); - -out1: /* Original packet has to be posted to the stack */ - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, sum, lro_mgr->ip_summed); -out: - return skb; -} - void lro_receive_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, void *priv) @@ -506,23 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr, } EXPORT_SYMBOL(lro_receive_skb); -void lro_receive_frags(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, void *priv, __wsum sum) -{ - struct sk_buff *skb; - - skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum); - if (!skb) - return; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); -} -EXPORT_SYMBOL(lro_receive_frags); - void lro_flush_all(struct net_lro_mgr *lro_mgr) { int i; @@ -534,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr) } } EXPORT_SYMBOL(lro_flush_all); - -void lro_flush_pkt(struct net_lro_mgr *lro_mgr, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (lro_desc->active) - lro_flush(lro_mgr, lro_desc); -} -EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 33d5537881e..bd5f5928167 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -26,20 +26,7 @@ * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. - * At this moment this information consists only of ID field for the next - * outgoing IP packet. This field is incremented with each packet as encoded - * in inet_getid() function (include/net/inetpeer.h). - * At the moment of writing this notes identifier of IP packets is generated - * to be unpredictable using this code only for packets subjected - * (actually or potentially) to defragmentation. I.e. DF packets less than - * PMTU in size when local fragmentation is disabled use a constant ID and do - * not use this code (see ip_select_ident() in include/net/ip.h). * - * Route cache entries hold references to our nodes. - * New cache entries get references via lookup by destination IP address in - * the avl tree. The reference is grabbed only when it's needed i.e. only - * when we try to output IP packet which needs an unpredictable ID (see - * __ip_select_ident() in net/ipv4/route.c). * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can @@ -62,7 +49,6 @@ * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable - * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; @@ -109,13 +95,6 @@ static inline void flush_check(struct inet_peer_base *base, int family) } } -void inetpeer_invalidate_family(int family) -{ - atomic_t *fp = inetpeer_seq_ptr(family); - - atomic_inc(fp); -} - #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ /* Exported for sysctl_net_ipv4. */ @@ -127,7 +106,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min static void inetpeer_gc_worker(struct work_struct *work) { struct inet_peer *p, *n, *c; - LIST_HEAD(list); + struct list_head list; spin_lock_bh(&gc_lock); list_replace_init(&gc_list, &list); @@ -227,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a, stackptr = _stack; \ *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty; ) { \ + u != peer_avl_empty;) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ @@ -282,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu; ) { \ + u->avl_right != peer_avl_empty_rcu;) { \ v = &u->avl_right; \ *stackptr++ = v; \ u = rcu_deref_locked(*v, base); \ @@ -504,10 +483,6 @@ relookup: p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, - (daddr->family == AF_INET) ? - secure_ip_id(daddr->addr.a4) : - secure_ipv6_id(daddr->addr.a6)); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first @@ -529,7 +504,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 694de3b7aeb..3a83ce5efa8 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,24 @@ #include <net/route.h> #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + skb->ignore_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -54,10 +72,15 @@ static int ip_forward_finish(struct sk_buff *skb) int ip_forward(struct sk_buff *skb) { + u32 mtu; struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + /* that should never happen */ + if (skb->pkt_type != PACKET_HOST) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -67,9 +90,6 @@ int ip_forward(struct sk_buff *skb) if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; - if (skb->pkt_type != PACKET_HOST) - goto drop; - skb_forward_csum(skb); /* @@ -88,11 +108,12 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + IPCB(skb)->flags |= IPSKB_FORWARDED; + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->dst))); + htonl(mtu)); goto drop; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2481993a497..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -232,8 +232,9 @@ static void ip_expire(unsigned long arg) * "Fragment Reassembly Timeout" message, per RFC792. */ if (qp->user == IP_DEFRAG_AF_PACKET || - (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL)) + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; @@ -704,7 +705,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) return NULL; - skb->rxhash = 0; + skb_clear_hash(skb); } } return skb; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d7aea4c5b94..9b842544aea 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -178,7 +178,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info, else itn = net_generic(net, ipgre_net_id); - iph = (const struct iphdr *)skb->data; + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); @@ -217,6 +217,7 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { + skb_pop_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); return PACKET_RCVD; } @@ -277,7 +278,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_dropped++; return NETDEV_TX_OK; @@ -300,7 +301,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_dropped++; return NETDEV_TX_OK; @@ -409,7 +410,7 @@ static int ipgre_open(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_gre(dev_net(dev), &fl4, + rt = ip_route_output_gre(t->net, &fl4, t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, @@ -433,7 +434,7 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; - in_dev = inetdev_by_index(dev_net(dev), t->mlink); + in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } @@ -462,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = { static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; + dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } @@ -476,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev) dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; - dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { @@ -500,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_IPGRE; dev->flags = IFF_NOARP; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->addr_len = 4; @@ -648,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 054a3e97d82..3d4da2c16b6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,7 +314,7 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec7264514a8..ad382499bac 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -167,7 +167,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); - for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. @@ -227,7 +227,7 @@ void ip_options_fragment(struct sk_buff *skb) continue; } optlen = optptr[1]; - if (optlen<2 || optlen>l) + if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); @@ -275,27 +275,31 @@ int ip_options_compile(struct net *net, for (l = opt->optlen; l > 0; ) { switch (*optptr) { - case IPOPT_END: - for (optptr++, l--; l>0; optptr++, l--) { + case IPOPT_END: + for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; - case IPOPT_NOOP: + case IPOPT_NOOP: l--; optptr++; continue; } + if (unlikely(l < 2)) { + pp_ptr = optptr; + goto error; + } optlen = optptr[1]; - if (optlen<2 || optlen>l) { + if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { - case IPOPT_SSRR: - case IPOPT_LSRR: + case IPOPT_SSRR: + case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; @@ -321,7 +325,7 @@ int ip_options_compile(struct net *net, opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; - case IPOPT_RR: + case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; @@ -349,7 +353,7 @@ int ip_options_compile(struct net *net, } opt->rr = optptr - iph; break; - case IPOPT_TIMESTAMP: + case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; @@ -364,19 +368,19 @@ int ip_options_compile(struct net *net, } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; - if (optptr[2]+3 > optptr[1]) { + if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { - case IPOPT_TS_TSONLY: + case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; - case IPOPT_TS_TSANDADDR: - if (optptr[2]+7 > optptr[1]) { + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } @@ -389,8 +393,8 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - case IPOPT_TS_PRESPEC: - if (optptr[2]+7 > optptr[1]) { + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } @@ -405,7 +409,7 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - default: + default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; @@ -433,7 +437,7 @@ int ip_options_compile(struct net *net, } opt->ts = optptr - iph; break; - case IPOPT_RA: + case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; @@ -441,7 +445,7 @@ int ip_options_compile(struct net *net, if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; - case IPOPT_CIPSO: + case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; @@ -452,9 +456,9 @@ int ip_options_compile(struct net *net, goto error; } break; - case IPOPT_SEC: - case IPOPT_SID: - default: + case IPOPT_SEC: + case IPOPT_SID: + default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; @@ -572,7 +576,7 @@ void ip_forward_options(struct sk_buff *skb) optptr = raw + opt->srr; - for ( srrptr=optptr[2], srrspace = optptr[1]; + for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { @@ -628,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt->rt_type != RTN_LOCAL) return -EINVAL; - for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 912402752f2..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } +static int ip_finish_output_gso(struct sk_buff *skb) +{ + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); + + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) sizeof(fl4->saddr) + sizeof(fl4->daddr)); } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ -371,7 +415,7 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; @@ -386,9 +430,9 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(skb, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -422,9 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - to->nf_trace = from->nf_trace; -#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif @@ -458,12 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || + mtu = ip_skb_dst_mtu(skb); + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { + IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(ip_skb_dst_mtu(skb))); + htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } @@ -473,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ + mtu = mtu - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -823,12 +865,11 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, - mtu-exthdrlen); + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -1147,11 +1188,11 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + size > maxnonfragsize - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -1308,11 +1349,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc < IP_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || inet->pmtudisc == IP_PMTUDISC_PROBE || @@ -1338,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt) { iph->ihl += opt->optlen>>2; @@ -1505,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, 0, + flowi4_init_output(&fl4, arg->bound_dev_if, + IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), @@ -1550,7 +1591,7 @@ void __init ip_init(void) ip_rt_init(); inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) - igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) + igmp_mc_init(); #endif } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ddf32a6bc41..64741b93863 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -56,7 +56,6 @@ /* * SOL_IP control messages. */ -#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { @@ -187,7 +186,8 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) } EXPORT_SYMBOL(ip_cmsg_recv); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, + bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; @@ -195,6 +195,22 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; +#if defined(CONFIG_IPV6) + if (allow_ipv6 && + cmsg->cmsg_level == SOL_IPV6 && + cmsg->cmsg_type == IPV6_PKTINFO) { + struct in6_pktinfo *src_info; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) + return -EINVAL; + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) + return -EINVAL; + ipc->oif = src_info->ipi6_ifindex; + ipc->addr = src_info->ipi6_addr.s6_addr32[3]; + continue; + } +#endif if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -390,7 +406,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; @@ -416,7 +432,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + @@ -628,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->nodefrag = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; inet->pmtudisc = val; break; @@ -1051,14 +1066,15 @@ e_inval: * * To support IP_CMSG_PKTINFO option, we store rt_iif and specific * destination in skb->cb[] before dst drop. - * This way, receiver doesnt make cache line misses to read rtable. + * This way, receiver doesn't make cache line misses to read rtable. */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); + bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || + ipv6_sk_rxinfo(sk); - if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) && - skb_rtable(skb)) { + if (prepare && skb_rtable(skb)) { pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 90ff9570d7d..6f9de61dce5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> +#include <linux/err.h> #include <net/sock.h> #include <net/ip.h> @@ -61,57 +62,59 @@ #include <net/ip6_route.h> #endif -static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, - __be32 key, __be32 remote) +static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); } -/* Often modified stats are per cpu, other are shared (netdev->stats) */ -struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) +static void __tunnel_dst_set(struct ip_tunnel_dst *idst, + struct dst_entry *dst) { - int i; + struct dst_entry *old_dst; - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } + dst_clone(dst); + old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); + dst_release(old_dst); +} - tot->multicast = dev->stats.multicast; +static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ + __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); +} - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; +static void tunnel_dst_reset(struct ip_tunnel *t) +{ + tunnel_dst_set(t, NULL); +} - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; +void ip_tunnel_dst_reset_all(struct ip_tunnel *t) +{ + int i; - tot->collisions = dev->stats.collisions; + for_each_possible_cpu(i) + __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); +} +EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - return tot; +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +{ + struct dst_entry *dst; + + rcu_read_lock(); + dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + if (dst) { + if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + tunnel_dst_reset(t); + dst_release(dst); + dst = NULL; + } + } + rcu_read_unlock(); + return (struct rtable *)dst; } -EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) @@ -146,7 +149,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; - hash = ip_tunnel_hash(itn, key, remote); + hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { @@ -166,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || + t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -178,14 +182,15 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, cand = t; } - hash = ip_tunnel_hash(itn, key, 0); + hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { - if ((local != t->parms.iph.saddr && - (local != t->parms.iph.daddr || - !ipv4_is_multicast(local))) || - !(t->dev->flags & IFF_UP)) + if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && + (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) + continue; + + if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) @@ -202,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (t->parms.i_key != key || + t->parms.iph.saddr != 0 || + t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -228,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, { unsigned int h; __be32 remote; + __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; - h = ip_tunnel_hash(itn, parms->i_key, remote); + if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + i_key = 0; + + h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } @@ -257,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; + __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); @@ -264,9 +276,9 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - key == t->parms.i_key && link == t->parms.link && - type == t->dev->type) + type == t->dev->type && + ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; @@ -318,11 +330,10 @@ failed: return ERR_PTR(err); } -static inline struct rtable *ip_route_output_tunnel(struct net *net, - struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif) +static inline void init_tunnel_flow(struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = oif; @@ -331,7 +342,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net, fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; - return ip_route_output_key(net, fl4); } static int ip_tunnel_bind_dev(struct net_device *dev) @@ -350,14 +360,14 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_tunnel(tunnel->net, &fl4, - tunnel->parms.iph.protocol, - iph->daddr, iph->saddr, - tunnel->parms.o_key, - RT_TOS(iph->tos), - tunnel->parms.link); + init_tunnel_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link); + rt = ip_route_output_key(tunnel->net, &fl4); + if (!IS_ERR(rt)) { tdev = rt->dst.dev; + tunnel_dst_set(tunnel, &rt->dst); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -386,14 +396,13 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm *parms) { - struct ip_tunnel *nt, *fbt; + struct ip_tunnel *nt; struct net_device *dev; BUG_ON(!itn->fb_tunnel_dev); - fbt = netdev_priv(itn->fb_tunnel_dev); dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); if (IS_ERR(dev)) - return NULL; + return ERR_CAST(dev); dev->mtu = ip_tunnel_bind_dev(dev); @@ -405,15 +414,12 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, bool log_ecn_error) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); int err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; tunnel->dev->stats.multicast++; skb->pkt_type = PACKET_BROADCAST; } @@ -436,6 +442,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } + skb_reset_network_header(skb); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -532,8 +540,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; + bool connected; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + connected = (tunnel->parms.iph.daddr != 0); dst = tnl_params->daddr; if (dst == 0) { @@ -581,27 +591,38 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, #endif else goto tx_error; + + connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + connected = false; + } else if (skb->protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + connected = false; + } } - rt = ip_route_output_tunnel(tunnel->net, &fl4, - protocol, - dst, tnl_params->saddr, - tunnel->parms.o_key, - RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + + rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; + + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (connected) + tunnel_dst_set(tunnel, &rt->dst); } + if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; @@ -618,6 +639,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; @@ -646,12 +668,13 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dev->needed_headroom = max_headroom; if (skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); dev->stats.tx_dropped++; - dev_kfree_skb(skb); + kfree_skb(skb); return; } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -663,7 +686,7 @@ tx_error_icmp: #endif tx_error: dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); @@ -696,25 +719,25 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } + ip_tunnel_dst_reset_all(t); netdev_state_change(dev); } int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; + struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: - t = NULL; - if (dev == itn->fb_tunnel_dev) + if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (t == NULL) - t = netdev_priv(dev); + if (t == NULL) + t = netdev_priv(dev); + } memcpy(p, &t->parms, sizeof(*p)); break; @@ -725,16 +748,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); - if (!(p->i_flags&TUNNEL_KEY)) - p->i_key = 0; - if (!(p->o_flags&TUNNEL_KEY)) - p->o_key = 0; + if (!(p->i_flags & VTI_ISVTI)) { + if (!(p->i_flags & TUNNEL_KEY)) + p->i_key = 0; + if (!(p->o_flags & TUNNEL_KEY)) + p->o_key = 0; + } t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (!t && (cmd == SIOCADDTUNNEL)) + if (!t && (cmd == SIOCADDTUNNEL)) { t = ip_tunnel_create(net, itn, p); - + err = PTR_ERR_OR_ZERO(t); + break; + } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { @@ -761,8 +788,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true); - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + } else { + err = -ENOENT; + } break; case SIOCDELTUNNEL: @@ -811,6 +839,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -855,11 +884,12 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); } rtnl_unlock(); - return PTR_RET(itn->fb_tunnel_dev); + return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); @@ -976,21 +1006,22 @@ int ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - int i, err; + int err; dev->destructor = ip_tunnel_dev_free; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_tstats *ipt_stats; - ipt_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ipt_stats->syncp); + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1015,6 +1046,8 @@ void ip_tunnel_uninit(struct net_device *dev) /* fb_tunnel_dev will be unregisted in net-exit call. */ if (itn->fb_tunnel_dev != dev) ip_tunnel_del(netdev_priv(dev)); + + ip_tunnel_dst_reset_all(tunnel); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 42ffbc8d65c..f4c987bb7e9 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,7 +46,7 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { @@ -56,7 +56,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, skb_scrub_packet(skb, xnet); - skb->rxhash = 0; + skb_clear_hash(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -74,9 +74,9 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out(skb); + err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; @@ -107,8 +107,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) nf_reset(skb); secpath_reset(skb); - if (!skb->l4_rxhash) - skb->rxhash = 0; + skb_clear_hash_if_not_l4(skb); skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); @@ -136,6 +135,14 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } + /* If packet is not gso and we are resolving any partial checksum, + * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL + * on the outer header without confusing devices that implement + * NETIF_F_IP_CSUM with encapsulation. + */ + if (csum_help) + skb->encapsulation = 0; + if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { err = skb_checksum_help(skb); if (unlikely(err)) @@ -149,3 +156,49 @@ error: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); + +/* Often modified stats are per cpu, other are shared (netdev->stats) */ +struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; + } + + tot->multicast = dev->stats.multicast; + + tot->rx_crc_errors = dev->stats.rx_crc_errors; + tot->rx_fifo_errors = dev->stats.rx_fifo_errors; + tot->rx_length_errors = dev->stats.rx_length_errors; + tot->rx_frame_errors = dev->stats.rx_frame_errors; + tot->rx_errors = dev->stats.rx_errors; + + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + + tot->collisions = dev->stats.collisions; + + return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 52b802a0cd8..b8960f3527f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -34,6 +34,7 @@ #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> +#include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> @@ -49,8 +50,8 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; static int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); -/* We dont digest the packet therefore let the packet pass */ -static int vti_rcv(struct sk_buff *skb) +static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -60,79 +61,120 @@ static int vti_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel != NULL) { - struct pcpu_tstats *tstats; - u32 oldmark = skb->mark; - int ret; - - - /* temporarily mark the skb with the tunnel o_key, to - * only match policies with this mark. - */ - skb->mark = be32_to_cpu(tunnel->parms.o_key); - ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb); - skb->mark = oldmark; - if (!ret) - return -1; - - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - secpath_reset(skb); - skb->dev = tunnel->dev; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + skb->mark = be32_to_cpu(tunnel->parms.i_key); + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + +static int vti_rcv(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); +} + +static int vti_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + + if (!tunnel) return 1; + + dev = tunnel->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; } - return -1; + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; } -/* This function assumes it is being called from dev_queue_xmit() - * and that skb is filled properly by that function. - */ +static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)&dst; + xfrm_address_t *saddr = (xfrm_address_t *)&src; -static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. + */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET) + return false; + + if (!dst) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) + return false; + + return true; +} + +static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, + struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *tiph = &tunnel->parms.iph; - u8 tos; - struct rtable *rt; /* Route to the other host */ + struct ip_tunnel_parm *parms = &tunnel->parms; + struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - __be32 dst = tiph->daddr; - struct flowi4 fl4; int err; - if (skb->protocol != htons(ETH_P_IP)) - goto tx_error; - - tos = old_iph->tos; + if (!dst) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } - memset(&fl4, 0, sizeof(fl4)); - flowi4_init_output(&fl4, tunnel->parms.link, - be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos), - RT_SCOPE_UNIVERSE, - IPPROTO_IPIP, 0, - dst, tiph->saddr, 0, 0); - rt = ip_route_output_key(dev_net(dev), &fl4); - if (IS_ERR(rt)) { + dst_hold(dst); + dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - /* if there is no transform then this tunnel is not functional. - * Or if the xfrm is not mode tunnel. - */ - if (!rt->dst.xfrm || - rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { + + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { dev->stats.tx_carrier_errors++; - ip_rt_put(rt); + dst_release(dst); goto tx_error_icmp; } - tdev = rt->dst.dev; + + tdev = dst->dev; if (tdev == dev) { - ip_rt_put(rt); + dst_release(dst); dev->stats.collisions++; goto tx_error; } @@ -146,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) tunnel->err_count = 0; } - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - nf_reset(skb); + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(skb); @@ -162,10 +202,102 @@ tx_error_icmp: dst_link_failure(skb); tx_error: dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + skb->mark = be32_to_cpu(tunnel->parms.o_key); + + switch (skb->protocol) { + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + default: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + return vti_xmit(skb, dev, &fl); +} + +static int vti4_err(struct sk_buff *skb, u32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip_tunnel *tunnel; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct iphdr *iph = (const struct iphdr *)skb->data; + int protocol = iph->protocol; + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!tunnel) + return -1; + + mark = be32_to_cpu(tunnel->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + static int vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -181,12 +313,19 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } + if (!(p.i_flags & GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags & GRE_KEY)) + p.o_key = 0; + + p.i_flags = VTI_ISVTI; + err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY | VTI_ISVTI; + p.i_flags |= GRE_KEY; p.o_flags |= GRE_KEY; } @@ -207,6 +346,7 @@ static const struct net_device_ops vti_netdev_ops = { static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; + dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } @@ -218,13 +358,11 @@ static int vti_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; @@ -241,9 +379,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; } -static struct xfrm_tunnel_notifier vti_handler __read_mostly = { +static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { .handler = vti_rcv, - .priority = 1, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, }; static int __net_init vti_init_net(struct net *net) @@ -287,6 +444,8 @@ static void vti_netlink_parms(struct nlattr *data[], if (!data) return; + parms->i_flags = VTI_ISVTI; + if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -382,10 +541,31 @@ static int __init vti_init(void) err = register_pernet_device(&vti_net_ops); if (err < 0) return err; - err = xfrm4_mode_tunnel_input_register(&vti_handler); + err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) { unregister_pernet_device(&vti_net_ops); pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); + if (err < 0) { + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; } err = rtnl_link_register(&vti_link_ops); @@ -395,7 +575,9 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_mode_tunnel_input_deregister(&vti_handler); + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); return err; } @@ -403,8 +585,13 @@ rtnl_link_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) + if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) + pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) + pr_info("vti close: can't deregister tunnel\n"); + unregister_pernet_device(&vti_net_ops); } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 826be4cb482..c0855d50a3f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -23,7 +23,7 @@ #include <net/protocol.h> #include <net/sock.h> -static void ipcomp4_err(struct sk_buff *skb, u32 info) +static int ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; @@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); + + return 0; } /* We always hold one tunnel user reference to indicate a tunnel */ @@ -147,6 +149,11 @@ out: return err; } +static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ipcomp_type = { .description = "IPCOMP4", .owner = THIS_MODULE, @@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = { .output = ipcomp_output }; -static const struct net_protocol ipcomp4_protocol = { +static struct xfrm4_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ipcomp4_rcv_cb, .err_handler = ipcomp4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init ipcomp4_init(void) @@ -170,7 +178,7 @@ static int __init ipcomp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); return -EAGAIN; @@ -180,7 +188,7 @@ static int __init ipcomp4_init(void) static void __exit ipcomp4_fini(void) { - if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index efa1138fa52..b3e86ea7b71 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -273,7 +273,7 @@ static int __init ic_open_devs(void) msleep(1); - if time_before(jiffies, next_msg) + if (time_before(jiffies, next_msg)) continue; elapsed = jiffies_to_msecs(jiffies - start); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fe3e9f7f1f0..62eaa005e14 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->dev->ifindex, 0, IPPROTO_IPIP, 0); + t->parms.link, 0, IPPROTO_IPIP, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, IPPROTO_IPIP, 0); err = 0; goto out; @@ -228,7 +228,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_errors++; return NETDEV_TX_OK; @@ -486,4 +486,5 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 421a24934ff..65bcaa78904 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - struct ipmr_result res; - struct fib_lookup_arg arg = { .result = &res, }; int err; + struct ipmr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); @@ -452,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) struct mr_table *mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_iif = skb->skb_iif, + .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi4_mark = skb->mark, }; int err; @@ -481,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev) dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; - dev->netdev_ops = ®_vif_netdev_ops, + dev->netdev_ops = ®_vif_netdev_ops; dev->destructor = free_netdev; dev->features |= NETIF_F_NETNS_LOCAL; } @@ -1660,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(skb, skb_dst(skb), NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -2252,13 +2255,14 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - u32 portid, u32 seq, struct mfc_cache *c, int cmd) + u32 portid, u32 seq, struct mfc_cache *c, int cmd, + int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); + nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2326,7 +2330,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, if (skb == NULL) goto errout; - err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd); + err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; @@ -2365,7 +2369,8 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (ipmr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE) < 0) + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) goto done; next_entry: e++; @@ -2379,7 +2384,8 @@ next_entry: if (ipmr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE) < 0) { + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) { spin_unlock_bh(&mfc_unres_lock); goto done; } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c3e0adea9c2..7ebd6e37875 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); if (IS_ERR(dst)) - return PTR_ERR(dst);; + return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 40d56073cd1..a26ce035e3f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -39,23 +39,38 @@ config NF_CONNTRACK_PROC_COMPAT config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" - -config NFT_REJECT_IPV4 - depends on NF_TABLES_IPV4 - tristate "nf_tables IPv4 reject support" + help + This option enables the IPv4 support for nf_tables. config NFT_CHAIN_ROUTE_IPV4 depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables route chain support" + help + This option enables the "route" chain for IPv4 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, type of service and + the packet mark. config NFT_CHAIN_NAT_IPV4 depends on NF_TABLES_IPV4 depends on NF_NAT_IPV4 && NFT_NAT tristate "IPv4 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv4 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + default NFT_REJECT + tristate config NF_TABLES_ARP depends on NF_TABLES tristate "ARP nf_tables support" + help + This option enables the ARP support for nf_tables. config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19df72b7ba8..90b82405331 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -28,9 +28,9 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o -obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 59da7cde072..f95b6f93814 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 718dfbd30cb..99e810f8467 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b969131ad1c..5b6e0df4ccf 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -17,10 +17,6 @@ #include <linux/udp.h> #include <linux/icmp.h> #include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/route.h> -#include <net/dst.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_REJECT.h> @@ -28,128 +24,12 @@ #include <linux/netfilter_bridge.h> #endif +#include <net/netfilter/ipv4/nf_reject.h> + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); - if (oth == NULL) - return; - - /* No RST for RST. */ - if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; - - /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; - - skb_reserve(nskb, LL_MAX_HEADER); - - skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - niph->version = 4; - niph->ihl = sizeof(struct iphdr) / 4; - niph->tos = 0; - niph->id = 0; - niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; - niph->check = 0; - niph->saddr = oiph->daddr; - niph->daddr = oiph->saddr; - - skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); - tcph->source = oth->dest; - tcph->dest = oth->source; - tcph->doff = sizeof(struct tcphdr) / 4; - - if (oth->ack) - tcph->seq = oth->ack_seq; - else { - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + - oldskb->len - ip_hdrlen(oldskb) - - (oth->doff << 2)); - tcph->ack = 1; - } - - tcph->rst = 1; - tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, - niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)tcph - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set_noref(nskb, skb_dst(oldskb)); - - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) - goto free_nskb; - - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - - /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) - goto free_nskb; - - nf_ct_attach(nskb, oldskb); - -#ifdef CONFIG_BRIDGE_NETFILTER - /* If we use ip_local_out for bridged traffic, the MAC source on - * the RST will be ours, instead of the destination's. This confuses - * some routers/firewalls, and they drop the packet. So we need to - * build the eth header using the original destination's MAC as the - * source, and send the RST packet directly. - */ - if (oldskb->nf_bridge) { - struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) - goto free_nskb; - dev_queue_xmit(nskb); - } else -#endif - ip_local_out(nskb); - - return; - - free_nskb: - kfree_skb(nskb); -} - -static inline void send_unreach(struct sk_buff *skb_in, int code) -{ - icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); -} - static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -157,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - send_unreach(skb, ICMP_NET_UNREACH); + nf_send_unreach(skb, ICMP_NET_UNREACH); break; case IPT_ICMP_HOST_UNREACHABLE: - send_unreach(skb, ICMP_HOST_UNREACH); + nf_send_unreach(skb, ICMP_HOST_UNREACH); break; case IPT_ICMP_PROT_UNREACHABLE: - send_unreach(skb, ICMP_PROT_UNREACH); + nf_send_unreach(skb, ICMP_PROT_UNREACH); break; case IPT_ICMP_PORT_UNREACHABLE: - send_unreach(skb, ICMP_PORT_UNREACH); + nf_send_unreach(skb, ICMP_PORT_UNREACH); break; case IPT_ICMP_NET_PROHIBITED: - send_unreach(skb, ICMP_NET_ANO); + nf_send_unreach(skb, ICMP_NET_ANO); break; case IPT_ICMP_HOST_PROHIBITED: - send_unreach(skb, ICMP_HOST_ANO); + nf_send_unreach(skb, ICMP_HOST_ANO); break; case IPT_ICMP_ADMIN_PROHIBITED: - send_unreach(skb, ICMP_PKT_FILTERED); + nf_send_unreach(skb, ICMP_PKT_FILTERED); break; case IPT_TCP_RESET: - send_reset(skb, par->hooknum); + nf_send_reset(skb, par->hooknum); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f13bd91d9a5..a313c3fbeb4 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -423,6 +423,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) static struct xt_target synproxy_tg4_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV4, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg4, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg4_check, diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c49dcd0284a..4bfaedf9b34 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -89,11 +89,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ipv4_is_multicast(iph->daddr)) { if (ipv4_is_zeronet(iph->saddr)) return ipv4_is_local_multicast(iph->daddr) ^ invert; - flow.flowi4_iif = 0; - } else { - flow.flowi4_iif = LOOPBACK_IFINDEX; } - + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ee2886126e3..f1787c04a4d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -91,17 +91,9 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, if (nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (!nat) { - /* NAT module was loaded late. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index ecd8bec411c..8127dc80286 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -548,9 +548,3 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) module_init(nf_conntrack_l3proto_ipv4_init); module_exit(nf_conntrack_l3proto_ipv4_fini); - -void need_ipv4_conntrack(void) -{ - return; -} -EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 12e13bd82b5..b8f6381c7d0 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) err = ip_defrag(skb, user); local_bh_enable(); - if (!err) + if (!err) { ip_send_check(ip_hdr(skb)); + skb->ignore_df = 1; + } return err; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9eea059dd62..574f7ebba0b 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -229,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); nated_port = 0; break; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 61a942265e8..7c676671329 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -461,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, } if (subid < 40) { - optr [0] = 0; - optr [1] = subid; + optr[0] = 0; + optr[1] = subid; } else if (subid < 80) { - optr [0] = 1; - optr [1] = subid - 40; + optr[0] = 1; + optr[1] = subid - 40; } else { - optr [0] = 2; - optr [1] = subid - 80; + optr[0] = 2; + optr[1] = subid - 80; } *len = 2; @@ -1198,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct, map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); } else { /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); + map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); + map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip); } if (map.from == map.to) diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 3e67ef1c676..19412a4063f 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -14,10 +14,30 @@ #include <linux/netfilter_arp.h> #include <net/netfilter/nf_tables.h> +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, ops, skb, in, out); + + return nft_do_chain(&pkt, ops); +} + static struct nft_af_info nft_af_arp __read_mostly = { .family = NFPROTO_ARP, .nhooks = NF_ARP_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + [NF_ARP_FORWARD] = nft_do_chain_arp, + }, }; static int nf_tables_arp_init_net(struct net *net) @@ -48,32 +68,14 @@ static struct pernet_operations nf_tables_arp_net_ops = { .exit = nf_tables_arp_exit_net, }; -static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_arp = { - .family = NFPROTO_ARP, +static const struct nf_chain_type filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_ARP, + .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), - .fn = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, }; static int __init nf_tables_arp_init(void) diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 0f4cbfeb19b..6820c8c4084 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,14 +18,25 @@ #include <net/ip.h> #include <net/netfilter/nf_tables_ipv4.h> +static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + return nft_do_chain(&pkt, ops); +} + static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct nft_pktinfo pkt; - if (unlikely(skb->len < sizeof(struct iphdr) || ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { if (net_ratelimit()) @@ -33,19 +44,24 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, "packet\n"); return NF_ACCEPT; } - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - return nft_do_chain_pktinfo(&pkt, ops); + return nft_do_chain_ipv4(ops, skb, in, out, okfn); } -static struct nft_af_info nft_af_ipv4 __read_mostly = { +struct nft_af_info nft_af_ipv4 __read_mostly = { .family = NFPROTO_IPV4, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_ipv4_output, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; +EXPORT_SYMBOL_GPL(nft_af_ipv4); static int nf_tables_ipv4_init_net(struct net *net) { @@ -75,42 +91,28 @@ static struct pernet_operations nf_tables_ipv4_net_ops = { .exit = nf_tables_ipv4_exit_net, }; -static unsigned int -nft_do_chain_ipv4(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_ipv4 = { - .family = NFPROTO_IPV4, +static const struct nf_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), - .fn = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, }; static int __init nf_tables_ipv4_init(void) { + int ret; + nft_register_chain_type(&filter_ipv4); - return register_pernet_subsys(&nf_tables_ipv4_net_ops); + ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_ipv4); + + return ret; } static void __exit nf_tables_ipv4_exit(void) diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index cf2c792cd97..3964157d826 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -48,15 +48,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - nat = nfct_nat(ct); - if (nat == NULL) { - /* Conntrack module was loaded late, can't add extension. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) - return NF_ACCEPT; - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: @@ -75,7 +69,7 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_ACCEPT) return ret; if (!nf_nat_initialized(ct, maniptype)) { @@ -164,21 +158,21 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_nat_ipv4 = { - .family = NFPROTO_IPV4, +static const struct nf_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .fn = { + .hooks = { [NF_INET_PRE_ROUTING] = nf_nat_prerouting, [NF_INET_POST_ROUTING] = nf_nat_postrouting, [NF_INET_LOCAL_OUT] = nf_nat_output, [NF_INET_LOCAL_IN] = nf_nat_fn, }, - .me = THIS_MODULE, }; static int __init nft_chain_nat_init(void) diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 4e6bf9a3d7a..125b66766c0 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -47,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -61,15 +61,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_route_ipv4 = { - .family = NFPROTO_IPV4, +static const struct nf_chain_type nft_chain_route_ipv4 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_OUT), - .fn = { + .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_hook, }, - .me = THIS_MODULE, }; static int __init nft_chain_route_init(void) diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index fff5ba1a33b..e79718a382f 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,108 +17,59 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0); + nf_send_unreach(pkt->skb, priv->icmp_code); break; case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); break; } data[NFT_REG_VERDICT].verdict = NF_DROP; } +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, - [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, -}; - -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_reject *priv = nft_expr_priv(expr); - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type)) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - } - - return 0; - -nla_put_failure: - return -1; -} - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { + .type = &nft_reject_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, + .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, }; -static struct nft_expr_type nft_reject_type __read_mostly = { +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, .name = "reject", - .ops = &nft_reject_ops, + .ops = &nft_reject_ipv4_ops, .policy = nft_reject_policy, .maxattr = NFTA_REJECT_MAX, .owner = THIS_MODULE, }; -static int __init nft_reject_module_init(void) +static int __init nft_reject_ipv4_module_init(void) { - return nft_register_expr(&nft_reject_type); + return nft_register_expr(&nft_reject_ipv4_type); } -static void __exit nft_reject_module_exit(void) +static void __exit nft_reject_ipv4_module_exit(void) { - nft_unregister_expr(&nft_reject_type); + nft_unregister_expr(&nft_reject_ipv4_type); } -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("reject"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 242e7f4ed6f..044a0ddf6a7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -53,8 +53,12 @@ #include <net/transp_v6.h> #endif +struct ping_table { + struct hlist_nulls_head hash[PING_HTABLE_SIZE]; + rwlock_t lock; +}; -struct ping_table ping_table; +static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); @@ -232,15 +236,15 @@ exit: static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { - kgid_t *data = net->ipv4.sysctl_ping_group_range; + kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } @@ -248,26 +252,33 @@ int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); - struct group_info *group_info = get_current_groups(); - int i, j, count = group_info->ngroups; + struct group_info *group_info; + int i, j, count; kgid_t low, high; + int ret = 0; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; + group_info = get_current_groups(); + count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { kgid_t gid = group_info->blocks[i][j]; if (gid_lte(low, gid) && gid_lte(gid, high)) - return 0; + goto out_release_group; } count -= cp_count; } - return -EACCES; + ret = -EACCES; + +out_release_group: + put_group_info(group_info); + return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); @@ -316,6 +327,9 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr_len < sizeof(*addr)) return -EINVAL; + if (addr->sin6_family != AF_INET6) + return -EINVAL; + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); @@ -668,8 +682,8 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, } EXPORT_SYMBOL_GPL(ping_common_sendmsg); -int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; @@ -696,7 +710,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) @@ -720,7 +734,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) return err; if (ipc.opt) @@ -869,7 +883,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Copy the address and add cmsg data. */ if (family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; @@ -886,8 +900,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else if (family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *ip6 = ipv6_hdr(skb); - struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; @@ -903,7 +916,12 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (inet6_sk(sk)->rxopt.all) - pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb); + pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); + if (skb->protocol == htons(ETH_P_IPV6) && + inet6_sk(sk)->rxopt.all) + pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); + else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) + ip_cmsg_recv(msg, skb); #endif } else { BUG(); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8ecd7ad959b..ae0af9386f7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), + SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), @@ -280,6 +281,11 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), + SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), + SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), + SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), + SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), + SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_SENTINEL }; @@ -333,22 +339,22 @@ static void icmp_put(struct seq_file *seq) atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_printf(seq, " OutMsgs OutErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } @@ -373,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); @@ -389,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -404,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udp_statistics, + snmp_fold_field(net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -415,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udplite_statistics, + snmp_fold_field(net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -452,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.net_statistics, + snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -462,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 23c3e5b5bb5..2c65160565e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -493,7 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (msg->msg_namelen) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; @@ -524,7 +524,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -575,7 +575,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | + inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); @@ -690,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f8da2827801..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -89,6 +89,7 @@ #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/jhash.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -112,9 +113,6 @@ #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#define IP_MAX_MTU 0xFFFF - #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; @@ -142,11 +140,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) -{ -} - static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); @@ -165,7 +158,6 @@ static struct dst_ops ipv4_dst_ops = { .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, @@ -197,7 +189,7 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -465,39 +457,45 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { + atomic_t id; + u32 stamp32; +}; + +static struct ip_ident_bucket *ip_idents __read_mostly; + +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. */ -static void ip_select_fb_ident(struct iphdr *iph) +u32 ip_idents_reserve(u32 hash, int segs) { - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; + struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(bucket->stamp32); + u32 now = (u32)jiffies; + u32 delta = 0; - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); + if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + delta = prandom_u32_max(now - old); + + return atomic_add_return(segs + delta, &bucket->id) - segs; } +EXPORT_SYMBOL(ip_idents_reserve); -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs) { - struct net *net = dev_net(dst->dev); - struct inet_peer *peer; + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; - peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); - if (peer) { - iph->id = htons(inet_getid(peer, more)); - inet_putpeer(peer); - return; - } + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); - ip_select_fb_ident(iph); + hash = jhash_3words((__force u32)iph->daddr, + (__force u32)iph->saddr, + iph->protocol, + ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); @@ -700,7 +698,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, out_unlock: spin_unlock_bh(&fnhe_lock); - return; } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, @@ -1003,6 +1000,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, struct flowi4 fl4; struct rtable *rt; + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); @@ -1020,6 +1020,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); @@ -1032,7 +1036,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - struct dst_entry *dst; + struct dst_entry *odst = NULL; bool new = false; bh_lock_sock(sk); @@ -1040,16 +1044,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (!ip_sk_accept_pmtu(sk)) goto out; - rt = (struct rtable *) __sk_dst_get(sk); + odst = sk_dst_get(sk); - if (sock_owned_by_user(sk) || !rt) { + if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - if (!__sk_dst_check(sk, 0)) { + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1059,8 +1064,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); - dst = dst_check(&rt->dst, 0); - if (!dst) { + if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); @@ -1072,10 +1076,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) } if (new) - __sk_dst_set(sk, &rt->dst); + sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); + dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); @@ -1139,7 +1144,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1529,7 +1534,7 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *out_dev; unsigned int flags = 0; bool do_cache; - u32 itag; + u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); @@ -1600,6 +1605,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1698,25 +1704,27 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); - if (err != 0) + if (err != 0) { + if (!IN_DEV_FORWARD(in_dev)) + err = -EHOSTUNREACH; goto no_route; - - RT_CACHE_STAT_INC(in_slow_tot); + } if (res.type == RTN_BROADCAST) goto brd_input; if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, - LOOPBACK_IFINDEX, - dev, in_dev, &itag); + 0, dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; @@ -1771,6 +1779,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; @@ -2223,7 +2232,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; new->dev = ort->dst.dev; if (new->dev) @@ -2362,7 +2371,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } else #endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) goto nla_put_failure; } @@ -2473,11 +2482,6 @@ errout_free: goto errout; } -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - return skb->len; -} - void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); @@ -2715,6 +2719,12 @@ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b95331e6c07..c86624b36a6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -121,7 +121,7 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ - diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; @@ -303,6 +303,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->ir_rmt_port = th->source; ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->ir_mark = inet_request_mark(sk, skb); ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -339,7 +340,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 38c8ec90ff6..79a007c5255 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,10 +45,10 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = range[0]; - net->ipv4.sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_seqlock(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = range[0]; + net->ipv4.ip_local_ports.range[1] = range[1]; + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -57,7 +57,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos) { struct net *net = - container_of(table->data, struct net, ipv4.sysctl_local_ports.range); + container_of(table->data, struct net, ipv4.ip_local_ports.range); int ret; int range[2]; struct ctl_table tmp = { @@ -87,14 +87,14 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); + container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } /* Update system visible IP port range */ @@ -102,11 +102,11 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); - write_seqlock(&net->ipv4.sysctl_local_ports.lock); + container_of(table->data, struct net, ipv4.ping_group_range.range); + write_seqlock(&net->ipv4.ip_local_ports.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &ip_ttl_max, }, { - .procname = "ip_no_pmtu_disc", - .data = &ipv4_config.no_pmtu_disc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -444,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_local_reserved_ports", - .data = NULL, /* initialized in sysctl_ipv4_init */ - .maxlen = 65536, - .mode = 0644, - .proc_handler = proc_do_large_bitmap, - }, - { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, .maxlen = sizeof(int), @@ -707,7 +693,7 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { + { .procname = "tcp_thin_dupack", .data = &sysctl_tcp_thin_dupack, .maxlen = sizeof(int), @@ -812,7 +798,7 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ping_group_range", - .data = &init_net.ipv4.sysctl_ping_group_range, + .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, @@ -826,11 +812,46 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range), - .data = &init_net.ipv4.sysctl_local_ports.range, + .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), + .data = &init_net.ipv4.ip_local_ports.range, .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = &init_net.ipv4.sysctl_local_reserved_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, + { + .procname = "ip_no_pmtu_disc", + .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_forward_use_pmtu", + .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "fwmark_reflect", + .data = &init_net.ipv4.sysctl_fwmark_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fwmark_accept", + .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -851,26 +872,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[i].data += (void *)net - (void *)&init_net; } - /* - * Sane defaults - nobody may create ping sockets. - * Boot scripts should set this to distro-specific group. - */ - net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); - net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); - - /* - * Set defaults for local port range - */ - seqlock_init(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = 32768; - net->ipv4.sysctl_local_ports.range[1] = 61000; - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; + net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_reserved_ports) + goto err_ports; + return 0; +err_ports: + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); @@ -882,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); @@ -895,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; - struct ctl_table *i; - - for (i = ipv4_table; i->procname; i++) { - if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { - i->data = sysctl_local_reserved_ports; - break; - } - } - if (!i->procname) - return -EINVAL; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0ca87547bec..9d2118e5fbc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -381,13 +381,13 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); + __skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -622,19 +622,21 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) } /* If a not yet filled skb is pushed, do not send it if - * we have packets in Qdisc or NIC queues : + * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. * As packets containing data payload have a bigger truesize - * than pure acks (dataless) packets, the last check prevents - * autocorking if we only have an ACK in Qdisc/NIC queues. + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && sysctl_tcp_autocorking && + skb != tcp_write_queue_head(sk) && atomic_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -660,7 +662,11 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &tp->tsq_flags); } - return; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. + */ + if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) + return; } if (flags & MSG_MORE) @@ -1038,7 +1044,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } } -static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, + int *copied, size_t size) { struct tcp_sock *tp = tcp_sk(sk); int err, flags; @@ -1053,11 +1060,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) if (unlikely(tp->fastopen_req == NULL)) return -ENOBUFS; tp->fastopen_req->data = msg; + tp->fastopen_req->size = size; flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, flags); - *size = tp->fastopen_req->copied; + *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); return err; } @@ -1077,7 +1085,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flags = msg->msg_flags; if (flags & MSG_FASTOPEN) { - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) @@ -1100,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); - goto out; + goto out_nopush; } err = -EINVAL; @@ -1274,6 +1282,7 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush: release_sock(sk); return copied + copied_syn; @@ -1660,11 +1669,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && net_dma_find_channel()) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif @@ -2223,7 +2232,7 @@ adjudge_to_death: /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. @@ -2333,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); - tp->srtt = 0; + tp->srtt_us = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; @@ -2777,8 +2786,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; - info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; - info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_rtt = tp->srtt_us >> 3; + info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; @@ -2788,6 +2797,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + + info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? + sk->sk_pacing_rate : ~0ULL; + info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? + sk->sk_max_pacing_rate : ~0ULL; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2903,6 +2917,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; + + case TCP_FASTOPEN: + if (icsk->icsk_accept_queue.fastopenq != NULL) + val = icsk->icsk_accept_queue.fastopenq->max_qlen; + else + val = 0; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 821846fb0a7..d5de69bc04f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -140,13 +140,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index ad37bf18ae4..7b09d8b49fa 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -276,27 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer - */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 left; - - if (in_flight >= tp->snd_cwnd) - return true; - - left = tp->snd_cwnd - in_flight; - if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size && - left < sk->sk_gso_max_segs) - return true; - return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but @@ -338,11 +317,11 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ @@ -362,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. */ -u32 tcp_reno_min_cwnd(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); - struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; /* Initial congestion control used (until SYN) @@ -388,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = { .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 828e4c3ffba..a9bd8a4828a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -304,13 +304,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { @@ -409,7 +408,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; ratio += cnt; - ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); + ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ @@ -476,10 +475,6 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - /* hystart needs ms clock resolution */ - if (hystart && HZ < 1000) - cubictcp.flags |= TCP_CONG_RTT_STAMP; - return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f195d9316e5..9771563ab56 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -72,25 +72,224 @@ error: kfree(ctx); return err; } -/* Computes the fastopen cookie for the IP path. - * The path is a 128 bits long (pad with zeros for IPv4). - * - * The caller must check foc->len to determine if a valid cookie - * has been generated successfully. -*/ -void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, - struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen(const void *path, + struct tcp_fastopen_cookie *foc) { - __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; + bool ok = false; tcp_fastopen_init_key_once(true); rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); + crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; + ok = true; } rcu_read_unlock(); + return ok; +} + +/* Generate the fastopen cookie by doing aes128 encryption on both + * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 + * addresses. For the longer IPv6 addresses use CBC-MAC. + * + * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. + */ +static bool tcp_fastopen_cookie_gen(struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + if (req->rsk_ops->family == AF_INET) { + const struct iphdr *iph = ip_hdr(syn); + + __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; + return __tcp_fastopen_cookie_gen(path, foc); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (req->rsk_ops->family == AF_INET6) { + const struct ipv6hdr *ip6h = ipv6_hdr(syn); + struct tcp_fastopen_cookie tmp; + + if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + struct in6_addr *buf = (struct in6_addr *) tmp.val; + int i = 4; + + for (i = 0; i < 4; i++) + buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; + return __tcp_fastopen_cookie_gen(buf, foc); + } + } +#endif + return false; +} + +static bool tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) +{ + struct tcp_sock *tp; + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + struct sock *child; + + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + return false; + + spin_lock(&queue->fastopenq->lock); + queue->fastopenq->qlen++; + spin_unlock(&queue->fastopenq->lock); + + /* Initialize the child socket. Have to fix some values to take + * into account the child is a Fast Open socket and is created + * only out of the bits carried in the SYN packet. + */ + tp = tcp_sk(child); + + tp->fastopen_rsk = req; + /* Do a hold on the listner sk so that if the listener is being + * closed, the child that has been accepted can live on and still + * access listen_lock. + */ + sock_hold(sk); + tcp_rsk(req)->listener = sk; + + /* RFC1323: The window in SYN & SYN/ACK segments is never + * scaled. So correct it appropriately. + */ + tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + + /* Activate the retrans timer so that SYNACK can be retransmitted. + * The request socket is not added to the SYN table of the parent + * because it's been added to the accept queue directly. + */ + inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); + + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, child); + + /* Now finish processing the fastopen child socket. */ + inet_csk(child)->icsk_af_ops->rebuild_header(child); + tcp_init_congestion_control(child); + tcp_mtup_init(child); + tcp_init_metrics(child); + tcp_init_buffer_space(child); + + /* Queue the data carried in the SYN packet. We need to first + * bump skb's refcnt because the caller will attempt to free it. + * + * XXX (TFO) - we honor a zero-payload TFO request for now, + * (any reason not to?) but no need to queue the skb since + * there is no data. How about SYN+FIN? + */ + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { + skb = skb_get(skb); + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + skb_set_owner_r(skb, child); + __skb_queue_tail(&child->sk_receive_queue, skb); + tp->syn_data_acked = 1; + } + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + sk->sk_data_ready(sk); + bh_unlock_sock(child); + sock_put(child); + WARN_ON(req->sk == NULL); + return true; +} +EXPORT_SYMBOL(tcp_fastopen_create_child); + +static bool tcp_fastopen_queue_check(struct sock *sk) +{ + struct fastopen_queue *fastopenq; + + /* Make sure the listener has enabled fastopen, and we don't + * exceed the max # of pending TFO requests allowed before trying + * to validating the cookie in order to avoid burning CPU cycles + * unnecessarily. + * + * XXX (TFO) - The implication of checking the max_qlen before + * processing a cookie request is that clients can't differentiate + * between qlen overflow causing Fast Open to be disabled + * temporarily vs a server not supporting Fast Open at all. + */ + fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq == NULL || fastopenq->max_qlen == 0) + return false; + + if (fastopenq->qlen >= fastopenq->max_qlen) { + struct request_sock *req1; + spin_lock(&fastopenq->lock); + req1 = fastopenq->rskq_rst_head; + if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + spin_unlock(&fastopenq->lock); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); + return false; + } + fastopenq->rskq_rst_head = req1->dl_next; + fastopenq->qlen--; + spin_unlock(&fastopenq->lock); + reqsk_free(req1); + } + return true; +} + +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and return the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) +{ + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; + bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + + if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + (syn_data || foc->len >= 0) && + tcp_fastopen_queue_check(sk))) { + foc->len = -1; + return false; + } + + if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + goto fastopen; + + if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + foc->len == TCP_FASTOPEN_COOKIE_SIZE && + foc->len == valid_foc.len && + !memcmp(foc->val, valid_foc.val, foc->len)) { + /* Cookie is valid. Create a (full) child socket to accept + * the data in SYN before returning a SYN-ACK to ack the + * data. If we fail to create the socket, fall back and + * ack the ISN only but includes the same cookie. + * + * Note: Data-less SYN with valid cookie is allowed to send + * data in SYN_RECV state. + */ +fastopen: + if (tcp_fastopen_create_child(sk, skb, dst, req)) { + foc->len = -1; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return true; + } + } + + NET_INC_STATS_BH(sock_net(sk), foc->len ? + LINUX_MIB_TCPFASTOPENPASSIVEFAIL : + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + *foc = valid_foc; + return false; } +EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8ed9305dfdf..1c4908280d9 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,12 +109,12 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) @@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "highspeed" diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a194acfd92..031361311a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,12 +227,12 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 478fe82611b..d8f8f05a495 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -21,7 +21,7 @@ struct hybla { u32 rho2; /* Rho * Rho, integer part */ u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho2_7ls; /* Rho^2, <<7 */ - u32 minrtt; /* Minimum smoothed round trip time value seen */ + u32 minrtt_us; /* Minimum smoothed round trip time value seen */ }; /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ @@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk) { struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, + tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), + 8U); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >> 7; @@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk) hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; tp->snd_cwnd = ca->rho; } @@ -85,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -94,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ - if (tp->srtt < ca->minrtt){ + if (tp->srtt_us < ca->minrtt_us) { hybla_recalc_param(sk); - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; } - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (!ca->hybla_en) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -166,7 +167,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 8a520996f3d..5999b3972e6 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,7 +23,6 @@ #define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ #define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ #define ALPHA_BASE ALPHA_SCALE /* 1.0 */ -#define U32_MAX ((u32)~0U) #define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ #define BETA_SHIFT 6 @@ -256,8 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); @@ -266,7 +264,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, update_params(sk); /* RFC2861 only increase cwnd if fully utilized */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In slow start */ @@ -326,10 +324,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } static struct tcp_congestion_ops tcp_illinois __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c53b7f35c51..40639c288dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -667,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); - long m = mrtt; /* RTT */ + long m = mrtt_us; /* RTT */ + u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -688,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -707,27 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (m > 0) m >>= 3; } else { - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ } - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev > tp->mdev_max) { - tp->mdev_max = tp->mdev; - if (tp->mdev_max > tp->rttvar) - tp->rttvar = tp->mdev_max; + tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev_us > tp->mdev_max_us) { + tp->mdev_max_us = tp->mdev_us; + if (tp->mdev_max_us > tp->rttvar_us) + tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + if (tp->mdev_max_us < tp->rttvar_us) + tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = tcp_rto_min(sk); + tp->mdev_max_us = tcp_rto_min_us(sk); } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev = m << 1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ + tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); + tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; } + tp->srtt_us = max(1U, srtt); } /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -742,18 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk) u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (HZ << 3); + rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); rate *= max(tp->snd_cwnd, tp->packets_out); - /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), - * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) - * We probably need usec resolution in the future. - * Note: This also takes care of possible srtt=0 case, - * when tcp_rtt_estimator() was not yet called. - */ - if (tp->srtt > 8 + 2) - do_div(rate, tp->srtt); + if (likely(tp->srtt_us)) + do_div(rate, tp->srtt_us); /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store @@ -766,7 +761,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1111,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1120,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; - int flag; - s32 rtt; /* RTT measured by SACKing never-retransmitted data */ + int reord; + int fack_count; + long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + int flag; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1167,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; } - err = tcp_fragment(sk, skb, pkt_len, mss); + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1184,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, u32 xmit_time) + int dup_sack, int pcount, + const struct skb_mstamp *xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1225,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk, if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; /* Pick the earliest sequence sacked for RTT */ - if (state->rtt < 0) - state->rtt = tcp_time_stamp - xmit_time; + if (state->rtt_us < 0) { + struct skb_mstamp now; + + skb_mstamp_get(&now); + state->rtt_us = skb_mstamp_us_delta(&now, + xmit_time); + } } if (sacked & TCPCB_LOST) { @@ -1285,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1563,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1620,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, s32 *sack_rtt) + u32 prior_snd_una, long *sack_rtt_us) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1638,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; - state.rtt = -1; + state.rtt_us = -1L; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1822,7 +1823,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt = state.rtt; + *sack_rtt_us = state.rtt_us; return state.flag; } @@ -1892,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1943,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2032,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * available, or RTO is scheduled to fire first. */ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt) + (flag & FLAG_ECE) || !tp->srtt_us) return false; - delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + delay = max(usecs_to_jiffies(tp->srtt_us >> 5), + msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; @@ -2237,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = skb_shinfo(skb)->gso_size; - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, + mss, GFP_ATOMIC); if (err < 0) break; cnt = packets; @@ -2660,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tp->retrans_out ? : -1; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) @@ -2680,13 +2685,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) bool recovered = !before(tp->snd_una, tp->high_seq); if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - if (flag & FLAG_ORIG_SACK_ACKED) { - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - tcp_try_undo_loss(sk, true); + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) return; - } + if (after(tp->snd_nxt, tp->high_seq) && (flag & FLAG_DATA_SACKED || is_dupack)) { tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ @@ -2882,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - s32 seq_rtt, s32 sack_rtt) + long seq_rtt_us, long sack_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2892,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * is acked (RFC6298). */ if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt = -1; + seq_rtt_us = -1L; - if (seq_rtt < 0) - seq_rtt = sack_rtt; + if (seq_rtt_us < 0) + seq_rtt_us = sack_rtt_us; /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -2903,14 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - if (seq_rtt < 0) + if (seq_rtt_us < 0) return false; - tcp_rtt_estimator(sk, seq_rtt); + tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. */ @@ -2922,22 +2926,23 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); - s32 seq_rtt = -1; + long seq_rtt_us = -1L; if (synack_stamp && !tp->total_retrans) - seq_rtt = tcp_time_stamp - synack_stamp; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() */ - if (!tp->srtt) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (!tp->srtt_us) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -3020,26 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, s32 sack_rtt) + u32 prior_snd_una, long sack_rtt_us) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - u32 now = tcp_time_stamp; + struct skb_mstamp first_ackt, last_ackt, now; + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; bool fully_acked = true; - int flag = 0; + long ca_seq_rtt_us = -1L; + long seq_rtt_us = -1L; + struct sk_buff *skb; u32 pkts_acked = 0; - u32 reord = tp->packets_out; - u32 prior_sacked = tp->sacked_out; - s32 seq_rtt = -1; - s32 ca_seq_rtt = -1; - ktime_t last_ackt = net_invalid_timestamp(); bool rtt_update; + int flag = 0; + + first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 acked_pcount; u8 sacked = scb->sacked; + u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3061,11 +3067,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } + last_ackt = skb->skb_mstamp; + WARN_ON_ONCE(last_ackt.v64 == 0); + if (!first_ackt.v64) + first_ackt = last_ackt; + if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) @@ -3111,7 +3117,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); + skb_mstamp_get(&now); + if (first_ackt.v64) { + seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); + ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops @@ -3139,25 +3151,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - s32 rtt_us = -1; - - /* Is the ACK triggering packet unambiguous? */ - if (!(flag & FLAG_RETRANS_DATA_ACKED)) { - /* High resolution needed and available? */ - if (ca_ops->flags & TCP_CONG_RTT_STAMP && - !ktime_equal(last_ackt, - net_invalid_timestamp())) - rtt_us = ktime_us_delta(ktime_get_real(), - last_ackt); - else if (ca_seq_rtt >= 0) - rtt_us = jiffies_to_usecs(ca_seq_rtt); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt >= 0 && - sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + } else if (skb && rtt_update && sack_rtt_us >= 0 && + sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3367,12 +3365,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; u32 prior_fackets; int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - s32 sack_rtt = -1; + long sack_rtt_us = -1L; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3400,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= FLAG_SND_UNA_ADVANCED; prior_fackets = tp->fackets_out; - prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3430,7 +3426,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; @@ -3449,12 +3445,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + sack_rtt_us); acked -= tp->packets_out; /* Advance cwnd if state allows */ if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked, prior_in_flight); + tcp_cong_avoid(sk, ack, acked); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3472,8 +3469,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) - tcp_update_pacing_rate(sk); + tcp_update_pacing_rate(sk); return 1; no_queue: @@ -3502,7 +3498,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -3686,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int opcode = *ptr++; int opsize; - switch(opcode) { + switch (opcode) { case TCPOPT_EOL: return NULL; case TCPOPT_NOP: @@ -4046,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ - for (i=this_sack+1; i < num_sacks; i++) + for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; @@ -4416,7 +4412,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } @@ -4706,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - /* Limited by application or receiver window. */ - u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; - } - tp->snd_cwnd_used = 0; - } - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4917,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -5003,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } out: return copied_early; @@ -5278,7 +5252,7 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } } @@ -5398,9 +5372,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, break; } tcp_rearm_rto(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; + if (tp->syn_data_acked) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); return false; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 06721392475..77cccda1ad0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -173,7 +173,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - orig_sport, orig_dport, sk, true); + orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) @@ -336,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; - struct request_sock *req; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; __u32 remaining; int err; struct net *net = dev_net(icmp_skb->dev); @@ -378,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - req = tp->fastopen_rsk; seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt) && - (req == NULL || seq != tcp_rsk(req)->snt_isn)) { - /* For a Fast Open socket, allow seq to be snt_isn. */ + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -426,16 +426,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) break; if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff) + !icsk->icsk_backoff || fastopen) break; - /* XXX (TFO) - revisit the following logic for TFO */ - if (sock_owned_by_user(sk)) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : + inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); @@ -462,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - /* XXX (TFO) - if it's a TFO socket and has been accepted, rather - * than following the TCP_SYN_RECV case and closing the socket, - * we ignore the ICMP error and keep trying like a fully established - * socket. Is this the right thing to do? - */ - if (req && req->sk == NULL) - goto out; - switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: @@ -502,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can f.e. if SYNs crossed, - or Fast Open. - */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; @@ -822,18 +815,19 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - u16 queue_mapping) + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; - struct sk_buff * skb; + struct sk_buff *skb; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, NULL); + skb = tcp_make_synack(sk, dst, req, foc); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -852,10 +846,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, 0); + int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); - if (!res) + if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } return res; } @@ -878,8 +874,6 @@ bool tcp_syn_flood_action(struct sock *sk, bool want_cookie = false; struct listen_sock *lopt; - - #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { msg = "Sending cookies"; @@ -1260,187 +1254,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { }; #endif -static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc) -{ - bool skip_cookie = false; - struct fastopen_queue *fastopenq; - - if (likely(!fastopen_cookie_present(foc))) { - /* See include/net/tcp.h for the meaning of these knobs */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || - ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) - skip_cookie = true; /* no cookie to validate */ - else - return false; - } - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - /* A FO option is present; bump the counter. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - - /* Make sure the listener has enabled fastopen, and we don't - * exceed the max # of pending TFO requests allowed before trying - * to validating the cookie in order to avoid burning CPU cycles - * unnecessarily. - * - * XXX (TFO) - The implication of checking the max_qlen before - * processing a cookie request is that clients can't differentiate - * between qlen overflow causing Fast Open to be disabled - * temporarily vs a server not supporting Fast Open at all. - */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || - fastopenq == NULL || fastopenq->max_qlen == 0) - return false; - - if (fastopenq->qlen >= fastopenq->max_qlen) { - struct request_sock *req1; - spin_lock(&fastopenq->lock); - req1 = fastopenq->rskq_rst_head; - if ((req1 == NULL) || time_after(req1->expires, jiffies)) { - spin_unlock(&fastopenq->lock); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); - /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ - foc->len = -1; - return false; - } - fastopenq->rskq_rst_head = req1->dl_next; - fastopenq->qlen--; - spin_unlock(&fastopenq->lock); - reqsk_free(req1); - } - if (skip_cookie) { - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; - } - - if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { - if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || - memcmp(&foc->val[0], &valid_foc->val[0], - TCP_FASTOPEN_COOKIE_SIZE) != 0) - return false; - valid_foc->len = -1; - } - /* Acknowledge the data received from the peer. */ - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; - } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENCOOKIEREQD); - } else { - /* Client sent a cookie with wrong size. Treat it - * the same as invalid and return a valid one. - */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - } - return false; -} - -static int tcp_v4_conn_req_fastopen(struct sock *sk, - struct sk_buff *skb, - struct sk_buff *skb_synack, - struct request_sock *req) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; - const struct inet_request_sock *ireq = inet_rsk(req); - struct sock *child; - int err; - - req->num_retrans = 0; - req->num_timeout = 0; - req->sk = NULL; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - kfree_skb(skb_synack); - return -1; - } - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); - if (!err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; - /* XXX (TFO) - is it ok to ignore error and continue? */ - - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); - - /* Initialize the child socket. Have to fix some values to take - * into account the child is a Fast Open socket and is created - * only out of the bits carried in the SYN packet. - */ - tp = tcp_sk(child); - - tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; - - /* RFC1323: The window in SYN & SYN/ACK segments is never - * scaled. So correct it appropriately. - */ - tp->snd_wnd = ntohs(tcp_hdr(skb)->window); - - /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent - * because it's been added to the accept queue directly. - */ - inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); - - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); - - /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_init_buffer_space(child); - - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now. - * (Any reason not to?) - */ - if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { - /* Don't queue the skb if there is no payload in SYN. - * XXX (TFO) - How about SYN+FIN? - */ - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - } else { - skb = skb_get(skb); - skb_dst_drop(skb); - __skb_pull(skb, tcp_hdr(skb)->doff * 4); - skb_set_owner_r(skb, child); - __skb_queue_tail(&child->sk_receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - tp->syn_data_acked = 1; - } - sk->sk_data_ready(sk, 0); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(req->sk == NULL); - return 0; -} - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tmp_opt; @@ -1451,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false; + bool want_cookie = false, fastopen; struct flowi4 fl4; struct tcp_fastopen_cookie foc = { .len = -1 }; - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; - struct sk_buff *skb_synack; - int do_fastopen; + int err; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1507,6 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(skb); + ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -1555,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } - tcp_rsk(req)->snt_isn = isn; - - if (dst == NULL) { - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto drop_and_free; - } - do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); - - /* We don't call tcp_v4_send_synack() directly because we need - * to make sure a child socket can be created successfully before - * sending back synack! - * - * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() - * (or better yet, call tcp_send_synack() in the child context - * directly, but will have to fix bunch of other code first) - * after syn_recv_sock() except one will need to first fix the - * latter to remove its dependency on the current implementation - * of tcp_v4_send_synack()->tcp_select_initial_window(). - */ - skb_synack = tcp_make_synack(sk, dst, req, - fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); - - if (skb_synack) { - __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); - } else + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) goto drop_and_free; - if (likely(!do_fastopen)) { - int err; - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v4_send_synack(sk, dst, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { if (err || want_cookie) goto drop_and_free; tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; - /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - if (fastopen_cookie_present(&foc) && foc.len != 0) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) - goto drop_and_free; + } return 0; @@ -1668,7 +1452,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } sk_setup_caps(newsk, dst); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && @@ -1745,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v4_check(skb->len, iph->saddr, - iph->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, IPPROTO_TCP, 0); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - - /* The socket must have it's spinlock held when we get * here. * @@ -1961,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb) * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ - if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) + + if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) goto csum_error; th = tcp_hdr(skb); @@ -2629,7 +2391,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, { __be32 dest, src; __u16 destp, srcp; - long delta = tw->tw_ttd - jiffies; + s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 991d62a2f9b..1e70fa8fa79 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } /** @@ -315,11 +314,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) } static struct tcp_congestion_ops tcp_lp __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7e522c558b..f7a2ec3ac58 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -102,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned long long val; int ret = 0; - switch (cft->private) { + buf = strstrip(buf); + + switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = tcp_update_limit(memcg, val); @@ -121,7 +123,7 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) @@ -168,17 +170,18 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_css(css); + memcg = mem_cgroup_from_css(of_css(of)); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + return nbytes; - switch (event) { + switch (of_cft(of)->private) { case RES_MAX_USAGE: res_counter_reset_max(&cg_proto->memory_allocated); break; @@ -187,13 +190,13 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, + .write = tcp_cgroup_write, .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, @@ -205,13 +208,13 @@ static struct cftype tcp_files[] = { { .name = "kmem.tcp.failcnt", .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { .name = "kmem.tcp.max_usage_in_bytes", .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { } /* terminate */ @@ -219,7 +222,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 06493736fbc..4fe04180598 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -22,6 +22,10 @@ int sysctl_tcp_nometrics_save __read_mostly; +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, + struct net *net, unsigned int hash); + struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10; /* Recurring Fast Open SYN losses */ @@ -29,14 +33,20 @@ struct tcp_fastopen_metrics { struct tcp_fastopen_cookie cookie; }; +/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility + * Kernel only stores RTT and RTTVAR in usec resolution + */ +#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) + struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; - struct inetpeer_addr tcpm_addr; + struct inetpeer_addr tcpm_saddr; + struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_ts; u32 tcpm_ts_stamp; u32 tcpm_lock; - u32 tcpm_vals[TCP_METRIC_MAX + 1]; + u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; struct rcu_head rcu_head; @@ -54,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm, return tm->tcpm_vals[idx]; } -static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, - enum tcp_metric_index idx) -{ - return msecs_to_jiffies(tm->tcpm_vals[idx]); -} - static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) @@ -67,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, tm->tcpm_vals[idx] = val; } -static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, - enum tcp_metric_index idx, - u32 val) -{ - tm->tcpm_vals[idx] = jiffies_to_msecs(val); -} - static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { @@ -96,9 +93,11 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, +static void tcpm_suck_dst(struct tcp_metrics_block *tm, + const struct dst_entry *dst, bool fastopen_clear) { + u32 msval; u32 val; tm->tcpm_stamp = jiffies; @@ -116,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, val |= 1 << TCP_METRIC_REORDERING; tm->tcpm_lock = val; - tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); - tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); + msval = dst_metric_raw(dst, RTAX_RTT); + tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + + msval = dst_metric_raw(dst, RTAX_RTTVAR); + tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); @@ -130,16 +132,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, } } +#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + tcpm_suck_dst(tm, dst, false); +} + +#define TCP_METRICS_RECLAIM_DEPTH 5 +#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, - struct inetpeer_addr *addr, - unsigned int hash, - bool reclaim) + struct inetpeer_addr *saddr, + struct inetpeer_addr *daddr, + unsigned int hash) { struct tcp_metrics_block *tm; struct net *net; + bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); net = dev_net(dst->dev); + + /* While waiting for the spin-lock the cache might have been populated + * with this entry and so we have to check again. + */ + tm = __tcp_get_metrics(saddr, daddr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) { + reclaim = true; + tm = NULL; + } + if (tm) { + tcpm_check_stamp(tm, dst); + goto out_unlock; + } + if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; @@ -155,7 +183,8 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } - tm->tcpm_addr = *addr; + tm->tcpm_saddr = *saddr; + tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, true); @@ -169,17 +198,6 @@ out_unlock: return tm; } -#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) - -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) -{ - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst, false); -} - -#define TCP_METRICS_RECLAIM_DEPTH 5 -#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL - static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) { if (tm) @@ -189,7 +207,8 @@ static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, in return NULL; } -static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; @@ -197,7 +216,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *a for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, addr)) + if (addr_same(&tm->tcpm_saddr, saddr) && + addr_same(&tm->tcpm_daddr, daddr)) break; depth++; } @@ -208,19 +228,22 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; - addr.family = req->rsk_ops->family; - switch (addr.family) { + saddr.family = req->rsk_ops->family; + daddr.family = req->rsk_ops->family; + switch (daddr.family) { case AF_INET: - addr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) addr.addr.a4; + saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; + daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; + hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; + *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -233,7 +256,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) break; } tcpm_check_stamp(tm, dst); @@ -243,32 +267,44 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; - addr.family = tw->tw_family; - switch (addr.family) { - case AF_INET: - addr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) addr.addr.a4; - break; + if (tw->tw_family == AF_INET) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } #if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; - hash = ipv6_addr_hash(&tw->tw_v6_daddr); - break; + else if (tw->tw_family == AF_INET6) { + if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; + hash = ipv6_addr_hash(&tw->tw_v6_daddr); + } + } #endif - default: + else return NULL; - } net = twsk_net(tw); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) break; } return tm; @@ -279,38 +315,45 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, bool create) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; - bool reclaim; - addr.family = sk->sk_family; - switch (addr.family) { - case AF_INET: - addr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) addr.addr.a4; - break; + if (sk->sk_family == AF_INET) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } #if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; - hash = ipv6_addr_hash(&sk->sk_v6_daddr); - break; + else if (sk->sk_family == AF_INET6) { + if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; + hash = ipv6_addr_hash(&sk->sk_v6_daddr); + } + } #endif - default: + else return NULL; - } net = dev_net(dst->dev); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - tm = __tcp_get_metrics(&addr, net, hash); - reclaim = false; - if (tm == TCP_METRICS_RECLAIM_PTR) { - reclaim = true; + tm = __tcp_get_metrics(&saddr, &daddr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) tm = NULL; - } if (!tm && create) - tm = tcpm_new(dst, &addr, hash, reclaim); + tm = tcpm_new(dst, &saddr, &daddr, hash); else tcpm_check_stamp(tm, dst); @@ -338,7 +381,7 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); rcu_read_lock(); - if (icsk->icsk_backoff || !tp->srtt) { + if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. @@ -353,8 +396,8 @@ void tcp_update_metrics(struct sock *sk) if (!tm) goto out_unlock; - rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); - m = rtt - tp->srtt; + rtt = tcp_metric_get(tm, TCP_METRIC_RTT); + m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is @@ -362,10 +405,10 @@ void tcp_update_metrics(struct sock *sk) */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) - rtt = tp->srtt; + rtt = tp->srtt_us; else rtt -= (m >> 3); - tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); + tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { @@ -376,16 +419,16 @@ void tcp_update_metrics(struct sock *sk) /* Scale deviation to rttvar fixed point */ m >>= 1; - if (m < tp->mdev) - m = tp->mdev; + if (m < tp->mdev_us) + m = tp->mdev_us; - var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; - tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); + tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); } if (tcp_in_initial_slowstart(tp)) { @@ -482,7 +525,7 @@ void tcp_init_metrics(struct sock *sk) tp->reordering = val; } - crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); + crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: /* The initial RTT measurement from the SYN/SYN-ACK is not ideal @@ -505,18 +548,20 @@ reset: * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (crtt > tp->srtt) { + if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt >>= 3; + crtt /= 8 * USEC_PER_MSEC; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); - } else if (tp->srtt == 0) { + } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious * retransmission. */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); + tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been @@ -724,15 +769,21 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, struct nlattr *nest; int i; - switch (tm->tcpm_addr.family) { + switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_addr.addr.a4) < 0) + tm->tcpm_daddr.addr.a4) < 0) + goto nla_put_failure; + if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, + tm->tcpm_saddr.addr.a4) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, - tm->tcpm_addr.addr.a6) < 0) + tm->tcpm_daddr.addr.a6) < 0) + goto nla_put_failure; + if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, + tm->tcpm_saddr.addr.a6) < 0) goto nla_put_failure; break; default: @@ -757,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; - for (i = 0; i < TCP_METRIC_MAX + 1; i++) { - if (!tm->tcpm_vals[i]) + for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { + u32 val = tm->tcpm_vals[i]; + + if (!val) continue; - if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) + if (i == TCP_METRIC_RTT) { + if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (i == TCP_METRIC_RTTVAR) { + if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (nla_put_u32(msg, i + 1, val) < 0) goto nla_put_failure; n++; } @@ -855,44 +922,66 @@ done: return skb->len; } -static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, - unsigned int *hash, int optional) +static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional, int v4, int v6) { struct nlattr *a; - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; + a = info->attrs[v4]; if (a) { addr->family = AF_INET; addr->addr.a4 = nla_get_be32(a); - *hash = (__force unsigned int) addr->addr.a4; + if (hash) + *hash = (__force unsigned int) addr->addr.a4; return 0; } - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; + a = info->attrs[v6]; if (a) { if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; addr->family = AF_INET6; memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); - *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + if (hash) + *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); return 0; } return optional ? 1 : -EAFNOSUPPORT; } +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional) +{ + return __parse_nl_addr(info, addr, hash, optional, + TCP_METRICS_ATTR_ADDR_IPV4, + TCP_METRICS_ATTR_ADDR_IPV6); +} + +static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) +{ + return __parse_nl_addr(info, addr, NULL, 0, + TCP_METRICS_ATTR_SADDR_IPV4, + TCP_METRICS_ATTR_SADDR_IPV6); +} + static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; + bool src = true; - ret = parse_nl_addr(info, &addr, &hash, 0); + ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -907,7 +996,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -962,32 +1052,38 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; + bool src = true, found = false; - ret = parse_nl_addr(info, &addr, &hash, 1); + ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) return tcp_metrics_flush_all(net); + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hb = net->ipv4.tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); - for (tm = deref_locked_genl(*pp); tm; - pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { - if (addr_same(&tm->tcpm_addr, &addr)) { + for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { *pp = tm->tcpm_next; - break; + kfree_rcu(tm, rcu_head); + found = true; + } else { + pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); - if (!tm) + if (!found) return -ESRCH; - kfree_rcu(tm, rcu_head); return 0; } @@ -1063,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net) tm = next; } } - if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) - vfree(net->ipv4.tcp_metrics_hash); - else - kfree(net->ipv4.tcp_metrics_hash); + kvfree(net->ipv4.tcp_metrics_hash); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 97b68415986..e68e0d4af6c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -297,6 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; + tw->tw_flowlabel = np->flow_label >> 12; tw->tw_ipv6only = np->ipv6only; } #endif @@ -361,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_openreq_init_rwin(struct request_sock *req, + struct sock *sk, struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + int mss = dst_metric_advmss(dst); + + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + ireq->rcv_wscale = rcv_wscale; +} +EXPORT_SYMBOL(tcp_openreq_init_rwin); + static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, struct request_sock *req) { @@ -397,8 +429,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->rcv_isn); - newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -425,7 +457,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); - skb_queue_head_init(&newtp->out_of_order_queue); + __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -744,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 05606353c7e..55046ecd083 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -57,10 +57,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; @@ -97,9 +99,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->check = newcheck; if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = - csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); + th->check = gso_make_checksum(skb, ~th->check); seq += mss; if (copy_destructor) { @@ -133,12 +133,10 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); + th->check = gso_make_checksum(skb, ~th->check); out: return segs; } -EXPORT_SYMBOL(tcp_gso_segment); struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -197,7 +195,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto out_check_final; found: - flush = NAPI_GRO_CB(p)->flush; + /* Include the IP ID check below from the inner most IP hdr */ + flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); @@ -230,17 +229,16 @@ out_check_final: pp = head; out: - NAPI_GRO_CB(skb)->flush |= flush; + NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; } -EXPORT_SYMBOL(tcp_gro_receive); int tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); - skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; @@ -272,6 +270,7 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb) static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { + /* Use the IP hdr immediately proceeding for this transport */ const struct iphdr *iph = skb_gro_network_header(skb); __wsum wsum; @@ -279,7 +278,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff * if (NAPI_GRO_CB(skb)->flush) goto skip_csum; - wsum = skb->csum; + wsum = NAPI_GRO_CB(skb)->csum; switch (skb->ip_summed) { case CHECKSUM_NONE: @@ -303,14 +302,14 @@ skip_csum: return tcp_gro_receive(head, skb); } -static int tcp4_gro_complete(struct sk_buff *skb) +static int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), - iph->saddr, iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, + iph->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; return tcp_gro_complete(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 993da005e08..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { tcp_rearm_rto(sk); } + + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, + tcp_skb_pcount(skb)); } /* SND.NXT, if window was not shrunk. @@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); @@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; @@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk) new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ - if (new_win == 0) + if (new_win == 0) { tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } return new_win; } @@ -408,7 +421,7 @@ struct tcp_out_options { * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). @@ -614,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - if (foc != NULL) { + if (foc != NULL && foc->len >= 0) { u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { @@ -681,7 +694,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. - * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. @@ -698,12 +711,13 @@ static void tcp_tsq_handler(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); } /* - * One tasklest per cpu tries to send more skbs. + * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when - * transfering tsq->head because tcp_wfree() might + * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(unsigned long data) @@ -766,6 +780,17 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); + /* Here begins the tricky part : + * We are called from release_sock() with : + * 1) BH disabled + * 2) sk_lock.slock spinlock held + * 3) socket owned by us (sk->sk_lock.owned == 1) + * + * But following code is meant to be called from BH handlers, + * so we should keep BH disabled, but early release socket ownership + */ + sock_release_ownership(sk); + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); __sock_put(sk); @@ -797,7 +822,7 @@ void __init tcp_tasklet_init(void) /* * Write buffer destructor automatically called from kfree_skb. - * We cant xmit new skbs from this context, as we might already + * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) @@ -853,18 +878,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); if (clone_it) { - const struct sk_buff *fclone = skb + 1; - - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. - */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + skb_mstamp_get(&skb->skb_mstamp); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -872,6 +886,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -958,7 +974,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -1058,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -1073,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, nsize, gfp); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ @@ -1364,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. */ + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out >= tp->snd_cwnd) { + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } + + if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1384,23 +1431,51 @@ static void tcp_cwnd_validate(struct sock *sk) } } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int max_segs) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) +{ + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, + int nonagle) +{ + return partial && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, max_len; + u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; @@ -1413,7 +1488,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b if (max_len <= needed) return max_len; - return needed - needed % mss_now; + partial = needed % mss_now; + /* If last segment is not a full MSS, check if Nagle rules allow us + * to include this last segment in this skb. + * Otherwise, we'll split the skb at last MSS boundary + */ + if (tcp_nagle_check(partial != 0, tp, nonagle)) + return needed - partial; + + return needed; } /* Can at least one segment of SKB be sent right now, according to the @@ -1453,28 +1536,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, return tso_segs; } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml, tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int mss_now, int nonagle) -{ - return skb->len < mss_now && - ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} /* Return true if the Nagle test allows this packet to be * sent now. @@ -1495,7 +1556,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; @@ -1564,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now); + return tcp_fragment(sk, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) @@ -1607,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1671,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (!tp->tso_deferred) tp->tso_deferred = 1 | (jiffies << 1); + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; + return true; send_now: @@ -1831,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1855,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { + is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -1871,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) break; } @@ -1890,7 +1958,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); - break; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + break; } limit = mss_now; @@ -1898,7 +1972,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, - sk->sk_gso_max_segs)); + sk->sk_gso_max_segs), + nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -1929,7 +2004,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk); + tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); @@ -1940,7 +2015,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, tlp_time_stamp, rto_time_stamp; - u32 rtt = tp->srtt >> 3; + u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) return false; @@ -1962,7 +2037,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -1993,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk) return true; } +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } + return false; +} + /* When probe timeout (PTO) fires, send a new segment if one exists, else * retransmit the last segment. */ @@ -2018,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb)) goto rearm_timer; + if (skb_still_in_host_queue(sk, skb)) + goto rearm_timer; + pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + GFP_ATOMIC))) goto rearm_timer; skb = tcp_write_queue_tail(sk); } @@ -2031,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - /* Probe with zero data doesn't trigger fast recovery. */ - if (skb->len > 0) - err = __tcp_retransmit_skb(sk, skb); + err = __tcp_retransmit_skb(sk, skb); /* Record snd_nxt for loss detection. */ if (likely(!err)) @@ -2047,7 +2143,6 @@ rearm_timer: if (likely(!err)) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); - return; } /* Push out any pending frames which were held back due to @@ -2145,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk) */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); - int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int allowed_space = tcp_full_space(sk); + int full_space = min_t(int, tp->window_clamp, allowed_space); int window; if (mss > full_space) @@ -2158,7 +2254,19 @@ u32 __tcp_select_window(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); - if (free_space < mss) + /* free_space might become our new window, make sure we don't + * increase it due to wscale. + */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + /* if free space is less than mss estimate, or is below 1/16th + * of the maximum allowed, try to move to zero-window, else + * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and + * new incoming data is dropped due to memory limits. + * With large window, mss test triggers way too late in order + * to announce zero window in time before rmem limit kicks in. + */ + if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } @@ -2313,6 +2421,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss; + int err; /* Inconslusive MTU probe */ if (icsk->icsk_mtup.probe_size) { @@ -2326,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; + if (skb_still_in_host_queue(sk, skb)) + return -EBUSY; + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); @@ -2348,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -EAGAIN; if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss)) + if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { int oldpcount = tcp_skb_pcount(skb); @@ -2376,11 +2488,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; } else { - return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + + if (likely(!err)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + /* Update global TCP statistics. */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans++; + } + return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) @@ -2389,11 +2511,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err = __tcp_retransmit_skb(sk, skb); if (err == 0) { - /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - - tp->total_retrans++; - #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); @@ -2408,15 +2525,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans += tcp_skb_pcount(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). */ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; - } else { + } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } + + if (tp->undo_retrans < 0) + tp->undo_retrans = 0; + tp->undo_retrans += tcp_skb_pcount(skb); return err; } @@ -2677,7 +2796,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, int tcp_header_size; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2692,27 +2811,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - - /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); - - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; - } - memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) @@ -2747,7 +2845,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2762,7 +2860,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. */ -void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2884,7 +2982,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; - syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + space = min_t(size_t, space, fo->size); + + /* limit to order-0 allocations */ + space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + + syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, sk->sk_allocation); if (syn_data == NULL) goto fallback; @@ -2914,9 +3017,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) tcp_connect_queue_skb(sk, data); fo->copied = data->len; + /* syn_data is about to be sent, we need to take current time stamps + * for the packets that are in write queue : SYN packet and DATA + */ + skb_mstamp_get(&syn->skb_mstamp); + data->skb_mstamp = syn->skb_mstamp; + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { tp->syn_data = (fo->copied > 0); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } syn_data = NULL; @@ -3004,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk) * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ - if (tp->srtt) { - int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); + if (tp->srtt_us) { + int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), + TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -3133,7 +3243,7 @@ int tcp_write_wakeup(struct sock *sk) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) + if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 8b97d71e193..3b66610d415 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -38,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1"); -static int port __read_mostly = 0; +static int port __read_mostly; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); @@ -46,7 +46,7 @@ static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); module_param(bufsize, uint, 0); -static unsigned int fwmark __read_mostly = 0; +static unsigned int fwmark __read_mostly; MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); module_param(fwmark, uint, 0); @@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, p->snd_wnd = tp->snd_wnd; p->rcv_wnd = tp->rcv_wnd; p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt >> 3; + p->srtt = tp->srtt_us >> 3; tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 19ea6c2951f..8250949b885 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,12 +15,11 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) @@ -39,7 +38,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "scalable", diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 64f0354c84c..286227abed1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -165,6 +165,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true); + if (tp->syn_data) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; syn_set = true; diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 06cae62bf20..9a5e05f27f4 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) return min(tp->snd_ssthresh, tp->snd_cwnd-1); } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; @@ -306,11 +305,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 326475a9486..27b9825753d 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,19 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } /* limited by applications */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* We do the Veno calculations only if we got enough rtt samples */ @@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u64 target_cwnd; u32 rtt; @@ -203,7 +202,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 76a1e23259e..b94a04ae2ed 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a347a078ee0..599b79b8eac 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -3,7 +3,7 @@ * YeAH TCP * * For further details look at: - * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf * */ #include <linux/mm.h> @@ -15,13 +15,13 @@ #include "tcp_vegas.h" -#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck -#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt -#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss -#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion -#define TCP_YEAH_PHY 8 //lin maximum delta from base -#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss -#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count +#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */ +#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */ +#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */ +#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */ +#define TCP_YEAH_PHY 8 /* maximum delta from base */ +#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */ +#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */ #define TCP_SCALABLE_AI_CNT 100U @@ -69,13 +69,12 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) @@ -214,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { if (yeah->doing_reno_now < TCP_YEAH_RHO) { reduction = yeah->lastQ; - reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); - reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); } else reduction = max(tp->snd_cwnd>>1, 2U); @@ -227,11 +226,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { } static struct tcp_congestion_ops tcp_yeah __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, .cong_avoid = tcp_yeah_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 44f6a20fa29..7d5a8661df7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -223,7 +223,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = net_random(); + rand = prandom_u32(); first = (((u64)rand * remaining) >> 32) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); @@ -560,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { - struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) - return sk; - else - return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - udptable); + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -731,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); - struct sk_buff *frags = skb_shinfo(skb)->frag_list; int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; - if (!frags) { + if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ @@ -746,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { + struct sk_buff *frags; + /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - do { + skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; - } while ((frags = frags->next)); + } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; @@ -766,6 +763,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) } EXPORT_SYMBOL_GPL(udp4_hwcsum); +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v4_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp_set_csum); + static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { struct sock *sk = skb->sk; @@ -789,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; @@ -906,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -935,7 +969,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, + sk->sk_family == AF_INET6); if (err) return err; if (ipc.opt) @@ -990,7 +1025,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk), faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); @@ -1230,7 +1265,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int peeked, off = 0; @@ -1498,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), @@ -1549,8 +1588,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; + } rc = 0; @@ -1603,12 +1645,16 @@ static void flush_stack(struct sock **stack, unsigned int count, kfree_skb(skb1); } -static void udp_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - struct dst_entry *dst = skb_dst(skb); + struct dst_entry *old; dst_hold(dst); - sk->sk_rx_dst = dst; + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); } /* @@ -1671,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { - const struct iphdr *iph; int err; UDP_SKB_CB(skb)->partial_cov = 0; @@ -1683,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return err; } - iph = ip_hdr(skb); - if (uh->check == 0) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - if (!skb_csum_unnecessary(skb)) - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, proto, 0); - /* Probably, we should checksum udp header (it should be in cache - * in any case) and data in tiny packets (< rx copybreak). - */ - - return 0; + return skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* @@ -1739,15 +1770,16 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - if (skb->sk) { + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); int ret; - sk = skb->sk; - if (unlikely(sk->sk_rx_dst == NULL)) - udp_sk_rx_dst_set(sk, skb); + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); ret = udp_queue_rcv_skb(sk, skb); - + sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 */ @@ -1832,6 +1864,10 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; + /* Do not bother scanning a too big list */ + if (hslot->count > 10) + return NULL; + rcu_read_lock(); begin: count = 0; @@ -1884,7 +1920,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; - INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr) + INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); rcu_read_lock(); @@ -1913,17 +1949,20 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, void udp_v4_early_demux(struct sk_buff *skb) { - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *uh = udp_hdr(skb); + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct udphdr *uh; struct sock *sk; struct dst_entry *dst; - struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return; + iph = ip_hdr(skb); + uh = udp_hdr(skb); + if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -1974,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); - int val; + int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); @@ -1984,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (get_user(val, (int __user *)optval)) return -EFAULT; + valbool = val ? 1 : 0; + switch (optname) { case UDP_CORK: if (val != 0) { @@ -2013,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } break; + case UDP_NO_CHECK6_TX: + up->no_check6_tx = valbool; + break; + + case UDP_NO_CHECK6_RX: + up->no_check6_rx = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -2095,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->encap_type; break; + case UDP_NO_CHECK6_TX: + val = up->no_check6_tx; + break; + + case UDP_NO_CHECK6_RX: + val = up->no_check6_rx; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: @@ -2474,11 +2531,16 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; netdev_features_t enc_features; - int outer_hlen; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; @@ -2490,17 +2552,25 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); skb->protocol = htons(ETH_P_TEB); + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; + } outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; skb = segs; do { struct udphdr *uh; - int udp_offset = outer_hlen - tnl_hlen; + int len; skb_reset_inner_headers(skb); skb->encapsulation = 1; @@ -2511,31 +2581,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; uh = udp_hdr(skb); - uh->len = htons(skb->len - udp_offset); + uh->len = htons(len); - /* csum segment if tunnel sets skb with csum. */ - if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { - struct iphdr *iph = ip_hdr(skb); - - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - udp_offset, - IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, - skb->len - udp_offset, 0)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; + if (need_csum) { + __be32 delta = htonl(oldlen + len); - } else if (protocol == htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u32 len = skb->len - udp_offset; + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - len, IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; } skb->protocol = protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 83206de2bc7..546d2d439dd 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,6 +14,17 @@ #include <net/udp.h> #include <net/protocol.h> +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; + +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + +struct udp_offload_priv { + struct udp_offload *offload; + struct rcu_head rcu; + struct udp_offload_priv __rcu *next; +}; + static int udp4_ufo_send_check(struct sk_buff *skb) { if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -41,6 +52,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; + int offset; + __wsum csum; + + if (skb->encapsulation && + (skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { + segs = skb_udp_tunnel_segment(skb, features); + goto out; + } mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -52,8 +72,10 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM | + SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; @@ -63,35 +85,162 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) - segs = skb_udp_tunnel_segment(skb, features); - else { - int offset; - __wsum csum; - - /* Do software UFO. Complete and fill in the UDP checksum as - * HW cannot do checksum of UDP packets sent as multiple - * IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - segs = skb_segment(skb, features); - } + segs = skb_segment(skb, features); out: return segs; } +int udp_add_offload(struct udp_offload *uo) +{ + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); + + if (!new_offload) + return -ENOMEM; + + new_offload->offload = uo; + + spin_lock(&udp_offload_lock); + new_offload->next = udp_offload_base; + rcu_assign_pointer(udp_offload_base, new_offload); + spin_unlock(&udp_offload_lock); + + return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ + struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); + kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ + struct udp_offload_priv __rcu **head = &udp_offload_base; + struct udp_offload_priv *uo_priv; + + spin_lock(&udp_offload_lock); + + uo_priv = udp_deref_protected(*head); + for (; uo_priv != NULL; + uo_priv = udp_deref_protected(*head)) { + if (uo_priv->offload == uo) { + rcu_assign_pointer(*head, + udp_deref_protected(uo_priv->next)); + goto unlock; + } + head = &uo_priv->next; + } + pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); +unlock: + spin_unlock(&udp_offload_lock); + if (uo_priv != NULL) + call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct udp_offload_priv *uo_priv; + struct sk_buff *p, **pp = NULL; + struct udphdr *uh, *uh2; + unsigned int hlen, off; + int flush = 1; + + if (NAPI_GRO_CB(skb)->udp_mark || + (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + goto out; + + /* mark that this skb passed once through the udp gro layer */ + NAPI_GRO_CB(skb)->udp_mark = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + uh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!uh)) + goto out; + } + + rcu_read_lock(); + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_receive) + goto unflush; + } + goto out_unlock; + +unflush: + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = (struct udphdr *)(p->data + off); + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct udp_offload_priv *uo_priv; + __be16 newlen = htons(skb->len - nhoff); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + int err = -ENOSYS; + + uh->len = newlen; + + rcu_read_lock(); + + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_complete) + break; + } + + if (uo_priv != NULL) + err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + + rcu_read_unlock(); + return err; +} + static const struct net_offload udpv4_offload = { .callbacks = { .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, + .gro_receive = udp_gro_receive, + .gro_complete = udp_gro_complete, }, }; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2c46acd4cc3..3b3efbda48e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 1f12c8b4586..aac6197b7a7 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -37,15 +37,6 @@ drop: return NET_RX_DROP; } -int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - return xfrm_input(skb, nexthdr, spi, encap_type); -} -EXPORT_SYMBOL(xfrm4_rcv_encap); - int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct iphdr *iph = ip_hdr(skb); diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e3db3f91511..71acd0014f2 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); skb_set_network_header(skb, -x->props.header_len - - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); + hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); if (x->sel.family != AF_INET6) skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 31b18152528..91771a7c802 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,65 +15,6 @@ #include <net/ip.h> #include <net/xfrm.h> -/* Informational hook. The decap is still done here. */ -static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; -static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); - -int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -EEXIST; - int priority = handler->priority; - - mutex_lock(&xfrm4_mode_tunnel_input_mutex); - - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t->priority > priority) - break; - if (t->priority == priority) - goto err; - - } - - handler->next = *pprev; - rcu_assign_pointer(*pprev, handler); - - ret = 0; - -err: - mutex_unlock(&xfrm4_mode_tunnel_input_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); - -int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -ENOENT; - - mutex_lock(&xfrm4_mode_tunnel_input_mutex); - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t == handler) { - *pprev = handler->next; - ret = 0; - break; - } - } - mutex_unlock(&xfrm4_mode_tunnel_input_mutex); - synchronize_net(); - - return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); - static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); @@ -117,24 +58,18 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - ip_select_ident(skb, dst->child, NULL); top_iph->ttl = ip4_dst_hoplimit(dst->child); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + ip_select_ident(skb, NULL); return 0; } -#define for_each_input_rcu(head, handler) \ - for (handler = rcu_dereference(head); \ - handler != NULL; \ - handler = rcu_dereference(handler->next)) - static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_tunnel_notifier *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; - for_each_input_rcu(rcv_notify_handlers, handler) - handler->handler(skb); - err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index baa0f63731f..d5f6bd9a210 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) goto out; mtu = dst_mtu(skb_dst(skb)); @@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - - skb->protocol = htons(ETH_P_IP); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; return x->outer_mode->output2(x, skb); } @@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER - if (!skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + skb->protocol = htons(ETH_P_IP); +#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IP); return xfrm_output(skb); } -int xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER + if (!x) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm4_output(struct sock *sk, struct sk_buff *skb) +{ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, dst->dev, - x->outer_mode->afinfo->output_finish, + NULL, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e1a63930a96..6156f68a1e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -325,6 +325,7 @@ void __init xfrm4_init(void) xfrm4_state_init(); xfrm4_policy_init(); + xfrm4_protocol_init(); #ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm4_net_ops); #endif diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 00000000000..a2ce0101eaa --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c @@ -0,0 +1,301 @@ +/* xfrm4_protocol.c - Generic xfrm protocol multiplexer. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/tunnel4.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_protocol_mutex); + +static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_handlers; + case IPPROTO_AH: + return &ah4_handlers; + case IPPROTO_COMP: + return &ipcomp4_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_cb); + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + if (!head) + goto out; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +static int xfrm4_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(esp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(esp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ah_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ah4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret;; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ah4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static const struct net_protocol esp4_protocol = { + .handler = xfrm4_esp_rcv, + .err_handler = xfrm4_esp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ah4_protocol = { + .handler = xfrm4_ah_rcv, + .err_handler = xfrm4_ah_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_ipcomp_rcv, + .err_handler = xfrm4_ipcomp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static struct xfrm_input_afinfo xfrm4_input_afinfo = { + .family = AF_INET, + .owner = THIS_MODULE, + .callback = xfrm4_rcv_cb, +}; + +static inline const struct net_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_protocol; + case IPPROTO_AH: + return &ah4_protocol; + case IPPROTO_COMP: + return &ipcomp4_protocol; + } + + return NULL; +} + +int xfrm4_protocol_register(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm4_protocol_mutex); + + if (add_netproto) { + if (inet_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_register); + +int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) { + if (inet_del_protocol(netproto(protocol), protocol) < 0) { + pr_err("%s: can't remove protocol\n", __func__); + ret = -EAGAIN; + } + } + + mutex_unlock(&xfrm4_protocol_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_deregister); + +void __init xfrm4_protocol_init(void) +{ + xfrm_input_register_afinfo(&xfrm4_input_afinfo); +} +EXPORT_SYMBOL(xfrm4_protocol_init); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 0b2a0641526..542074c00c7 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -16,7 +16,7 @@ static int xfrm4_init_flags(struct xfrm_state *x) { - if (ipv4_config.no_pmtu_disc) + if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) x->props.flags |= XFRM_STATE_NOPMTUDISC; return 0; } |
