Diffstat (limited to 'net/ipv4')
135 files changed, 16645 insertions, 13936 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 20f1cb5c8ab..05c57f0fcab 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -9,10 +9,7 @@ config IP_MULTICAST intend to participate in the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. More information about the MBONE is on the WWW at - <http://www.savetz.com/mbone/>. Information about the multicast - capabilities of the various network cards is contained in - <file:Documentation/networking/multicast.txt>. For most people, it's - safe to say N. + <http://www.savetz.com/mbone/>. For most people, it's safe to say N. config IP_ADVANCED_ROUTER bool "IP: advanced router" @@ -166,6 +163,7 @@ config IP_PNP_RARP config NET_IPIP tristate "IP: tunneling" select INET_TUNNEL + select NET_IP_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -186,9 +184,14 @@ config NET_IPGRE_DEMUX This is helper module to demultiplex GRE packets on GRE version field criteria. Required by ip_gre and pptp modules. +config NET_IP_TUNNEL + tristate + default n + config NET_IPGRE tristate "IP: GRE tunnels over IP" depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX + select NET_IP_TUNNEL help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -217,10 +220,8 @@ config IP_MROUTE packets that have several destination addresses. It is needed on the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. In order to do that, you would most - likely run the program mrouted. Information about the multicast - capabilities of the various network cards is contained in - <file:Documentation/networking/multicast.txt>. If you haven't heard - about it, you don't need it. + likely run the program mrouted. If you haven't heard about it, you + don't need it. config IP_MROUTE_MULTIPLE_TABLES bool "IP: multicast policy routing" @@ -258,22 +259,6 @@ config IP_PIMSM_V2 gated-5). This routing protocol is not used widely, so say N unless you want to play with it. -config ARPD - bool "IP: ARP daemon support" - ---help--- - The kernel maintains an internal cache which maps IP addresses to - hardware addresses on the local network, so that Ethernet - frames are sent to the proper address on the physical networking - layer. Normally, kernel uses the ARP protocol to resolve these - mappings. - - Saying Y here adds support to have an user space daemon to do this - resolution instead. This is useful for implementing an alternate - address resolution protocol (e.g. NHRP on mGRE tunnels) and also for - testing purposes. - - If unsure, say N. - config SYN_COOKIES bool "IP: TCP syncookie support" ---help--- @@ -310,6 +295,18 @@ config SYN_COOKIES If unsure, say N. +config NET_IPVTI + tristate "Virtual (secure) IP: tunneling" + select INET_TUNNEL + select NET_IP_TUNNEL + depends on INET_XFRM_MODE_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This can be used with xfrm mode tunnel to give + the notion of a secure tunnel for IPSEC and then use routing protocol + on top. 
+ config INET_AH tristate "IP: AH transformation" select XFRM_ALGO @@ -477,7 +474,6 @@ config TCP_CONG_HTCP config TCP_CONG_HSTCP tristate "High Speed TCP" - depends on EXPERIMENTAL default n ---help--- Sally Floyd's High Speed TCP (RFC 3649) congestion control. @@ -488,7 +484,6 @@ config TCP_CONG_HSTCP config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" - depends on EXPERIMENTAL default n ---help--- TCP-Hybla is a sender-side only change that eliminates penalization of @@ -498,7 +493,6 @@ config TCP_CONG_HYBLA config TCP_CONG_VEGAS tristate "TCP Vegas" - depends on EXPERIMENTAL default n ---help--- TCP Vegas is a sender-side only change to TCP that anticipates @@ -509,7 +503,6 @@ config TCP_CONG_VEGAS config TCP_CONG_SCALABLE tristate "Scalable TCP" - depends on EXPERIMENTAL default n ---help--- Scalable TCP is a sender-side only change to TCP which uses a @@ -519,7 +512,6 @@ config TCP_CONG_SCALABLE config TCP_CONG_LP tristate "TCP Low Priority" - depends on EXPERIMENTAL default n ---help--- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is @@ -529,7 +521,6 @@ config TCP_CONG_LP config TCP_CONG_VENO tristate "TCP Veno" - depends on EXPERIMENTAL default n ---help--- TCP Veno is a sender-side only enhancement of TCP to obtain better @@ -541,7 +532,6 @@ config TCP_CONG_VENO config TCP_CONG_YEAH tristate "YeAH TCP" - depends on EXPERIMENTAL select TCP_CONG_VEGAS default n ---help--- @@ -556,7 +546,6 @@ config TCP_CONG_YEAH config TCP_CONG_ILLINOIS tristate "TCP Illinois" - depends on EXPERIMENTAL default n ---help--- TCP-Illinois is a sender-side modification of TCP Reno for @@ -620,8 +609,7 @@ config DEFAULT_TCP_CONG default "cubic" config TCP_MD5SIG - bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "TCP: MD5 Signature Option support (RFC2385)" select CRYPTO select CRYPTO_MD5 ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ff75d3bbcd6..f032688d20d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,19 +7,22 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o \ - datagram.o raw.o udp.o udplite.o \ - arp.o icmp.o devinet.o af_inet.o igmp.o \ + tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ + tcp_offload.o datagram.o raw.o udp.o udplite.o \ + udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o ping.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o +obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o +gre-y := gre_demux.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o @@ -48,8 +51,8 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o +obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ - xfrm4_output.o + 
xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c8f7aee587d..d156b3c5f36 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -111,10 +111,10 @@ #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> -#include <net/ipip.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/net_namespace.h> +#include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif @@ -126,9 +126,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); - /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -157,6 +154,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -211,6 +209,26 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { + /* Check special setups for testing purpose to enable TFO w/o + * requiring TCP_FASTOPEN sockopt. + * Note that only TCP sockets (SOCK_STREAM) will reach here. + * Also fastopenq may already been allocated because this + * socket was in TCP_LISTEN state previously but was + * shutdown() (rather than close()). + */ + if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && + inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { + if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) + err = fastopen_init_queue(sk, backlog); + else if ((sysctl_tcp_fastopen & + TFO_SERVER_WO_SOCKOPT2) != 0) + err = fastopen_init_queue(sk, + ((uint)sysctl_tcp_fastopen) >> 16); + else + err = 0; + if (err) + goto out; + } err = inet_csk_listen_start(sk, backlog); if (err) goto out; @@ -224,41 +242,6 @@ out: } EXPORT_SYMBOL(inet_listen); -u32 inet_ehash_secret __read_mostly; -EXPORT_SYMBOL(inet_ehash_secret); - -/* - * inet_ehash_secret must be set exactly once - */ -void build_ehash_secret(void) -{ - u32 rnd; - - do { - get_random_bytes(&rnd, sizeof(rnd)); - } while (rnd == 0); - - cmpxchg(&inet_ehash_secret, 0, rnd); -} -EXPORT_SYMBOL(build_ehash_secret); - -static inline int inet_netns_ok(struct net *net, int protocol) -{ - int hash; - const struct net_protocol *ipprot; - - if (net_eq(net, &init_net)) - return 1; - - hash = protocol & (MAX_INET_PROTOS - 1); - ipprot = rcu_dereference(inet_protos[hash]); - - if (ipprot == NULL) - /* raw IP is OK */ - return 1; - return ipprot->netns_ok; -} - /* * Create an inet socket. */ @@ -271,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; - if (unlikely(!inet_ehash_secret)) - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) - build_ehash_secret(); - sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. 
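For context on the listen-time check above: outside of these test-only sysctl bits, a server enables Fast Open per listener with the TCP_FASTOPEN socket option; the TFO_SERVER_WO_SOCKOPT* flags simply let the fastopen queue be set up at listen() time without that call. A minimal userspace sketch, assuming Linux headers (the fallback 23 is the uapi TCP_FASTOPEN value):

    #include <stdio.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef TCP_FASTOPEN
    #define TCP_FASTOPEN 23		/* Linux uapi value */
    #endif

    int main(void)
    {
    	int fd = socket(AF_INET, SOCK_STREAM, 0);
    	int qlen = 16;			/* max queued TFO requests */
    	struct sockaddr_in addr = { .sin_family = AF_INET,
    				    .sin_port = htons(8080) };

    	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
    		return 1;
    	/* Enable Fast Open on this listener (queue depth qlen); the
    	 * sysctl bits handled in inet_listen() above exist to get the
    	 * same effect without this setsockopt() call. */
    	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
    		perror("TCP_FASTOPEN");
    	if (listen(fd, 128) < 0)
    		return 1;
    	/* accept() as usual; data carried in the SYN is readable on the
    	 * accepted socket right away. */
    	close(fd);
    	return 0;
    }

On the client side, the connect-with-data path exercised further down (fastopen_req) is driven by sendto()/sendmsg() with MSG_FASTOPEN rather than an explicit connect().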
*/ @@ -327,16 +305,12 @@ lookup_protocol: } err = -EPERM; - if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) - goto out_rcu_unlock; - - err = -EAFNOSUPPORT; - if (!inet_netns_ok(net, protocol)) + if (sock->type == SOCK_RAW && !kern && + !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -348,7 +322,6 @@ lookup_protocol: goto out; err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; @@ -363,7 +336,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -454,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; int err; @@ -477,7 +451,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since @@ -497,7 +471,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + if (snum && snum < PROT_SOCK && + !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; /* We keep a pair of addresses. rcv_saddr is the one @@ -553,15 +528,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); + return sk->sk_prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); -static long inet_wait_for_connect(struct sock *sk, long timeo) +static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -577,6 +553,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); + sk->sk_write_pending -= writebias; return timeo; } @@ -584,8 +561,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; int err; @@ -594,8 +571,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; - lock_sock(sk); - if (uaddr->sa_family == AF_UNSPEC) { err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? 
SS_DISCONNECTING : SS_UNCONNECTED; @@ -635,8 +610,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int writebias = (sk->sk_protocol == IPPROTO_TCP) && + tcp_sk(sk)->fastopen_req && + tcp_sk(sk)->fastopen_req->data ? 1 : 0; + /* Error code is set above */ - if (!timeo || !inet_wait_for_connect(sk, timeo)) + if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); @@ -658,7 +637,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; err = 0; out: - release_sock(sk); return err; sock_error: @@ -668,6 +646,18 @@ sock_error: sock->state = SS_DISCONNECTING; goto out; } +EXPORT_SYMBOL(__inet_stream_connect); + +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + int err; + + lock_sock(sock->sk); + err = __inet_stream_connect(sock, uaddr, addr_len, flags); + release_sock(sock->sk); + return err; +} EXPORT_SYMBOL(inet_stream_connect); /* @@ -687,7 +677,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) sock_rps_record_flow(sk2); WARN_ON(!((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); + (TCPF_ESTABLISHED | TCPF_SYN_RECV | + TCPF_CLOSE_WAIT | TCPF_CLOSE))); sock_graft(sk2, newsock); @@ -1008,7 +999,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, @@ -1018,7 +1008,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -1027,7 +1016,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }, @@ -1036,7 +1024,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } }; @@ -1136,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, false); + inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -1216,8 +1203,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) { + const struct net_offload *ops; const struct iphdr *iph; - const struct net_protocol *ops; int proto; int ihl; int err = -EINVAL; @@ -1230,47 +1217,55 @@ static int inet_gso_send_check(struct sk_buff *skb) if (ihl < sizeof(*iph)) goto out; + proto = iph->protocol; + + /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - __skb_pull(skb, ihl); + skb_reset_transport_header(skb); - iph = ip_hdr(skb); - proto = iph->protocol & (MAX_INET_PROTOS - 1); err = -EPROTONOSUPPORT; - rcu_read_lock(); - ops = rcu_dereference(inet_protos[proto]); - if (likely(ops && ops->gso_send_check)) - err = ops->gso_send_check(skb); - rcu_read_unlock(); + ops = rcu_dereference(inet_offloads[proto]); + if (likely(ops && ops->callbacks.gso_send_check)) + err = ops->callbacks.gso_send_check(skb); out: return err; } static 
struct sk_buff *inet_gso_segment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + const struct net_offload *ops; + unsigned int offset = 0; + bool udpfrag, encap; struct iphdr *iph; - const struct net_protocol *ops; int proto; + int nhoff; int ihl; int id; - unsigned int offset = 0; - - if (!(features & NETIF_F_V4_CSUM)) - features &= ~NETIF_F_SG; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP | + SKB_GSO_SIT | + SKB_GSO_TCPV6 | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_MPLS | 0))) goto out; + skb_reset_network_header(skb); + nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; @@ -1279,39 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (ihl < sizeof(*iph)) goto out; + id = ntohs(iph->id); + proto = iph->protocol; + + /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - __skb_pull(skb, ihl); + + encap = SKB_GSO_CB(skb)->encap_level > 0; + if (encap) + features = skb->dev->hw_enc_features & netif_skb_features(skb); + SKB_GSO_CB(skb)->encap_level += ihl; + skb_reset_transport_header(skb); - iph = ip_hdr(skb); - id = ntohs(iph->id); - proto = iph->protocol & (MAX_INET_PROTOS - 1); + segs = ERR_PTR(-EPROTONOSUPPORT); - rcu_read_lock(); - ops = rcu_dereference(inet_protos[proto]); - if (likely(ops && ops->gso_segment)) - segs = ops->gso_segment(skb, features); - rcu_read_unlock(); + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + udpfrag = proto == IPPROTO_UDP && encap; + else + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; + + ops = rcu_dereference(inet_offloads[proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); - if (!segs || IS_ERR(segs)) + if (IS_ERR_OR_NULL(segs)) goto out; skb = segs; do { - iph = ip_hdr(skb); - if (proto == IPPROTO_UDP) { + iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); + if (udpfrag) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next != NULL) iph->frag_off |= htons(IP_MF); - offset += (skb->len - skb->mac_len - iph->ihl * 4); - } else + offset += skb->len - nhoff - ihl; + } else { iph->id = htons(id++); - iph->tot_len = htons(skb->len - skb->mac_len); - iph->check = 0; - iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); + } + iph->tot_len = htons(skb->len - nhoff); + ip_send_check(iph); + if (encap) + skb_reset_inner_headers(skb); + skb->network_header = (u8 *)iph - skb->head; } while ((skb = skb->next)); out: @@ -1321,7 +1330,7 @@ out: static struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - const struct net_protocol *ops; + const struct net_offload *ops; struct sk_buff **pp = NULL; struct sk_buff *p; const struct iphdr *iph; @@ -1340,21 +1349,21 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, goto out; } - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; rcu_read_lock(); - ops = rcu_dereference(inet_protos[proto]); - if (!ops || !ops->gro_receive) + ops = rcu_dereference(inet_offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) goto out_unlock; if (*(u8 *)iph != 0x45) goto out_unlock; - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto 
out_unlock; id = ntohl(*(__be32 *)&iph->id); - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; for (p = *head; p; p = p->next) { @@ -1363,10 +1372,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ip_hdr(p); - + iph2 = (struct iphdr *)(p->data + off); + /* The above works because, with the exception of the top + * (inner most) layer, we only aggregate pkts with the same + * hdr length so all the hdrs we'll need to verify will start + * at the same offset. + */ if ((iph->protocol ^ iph2->protocol) | - (iph->tos ^ iph2->tos) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; @@ -1376,16 +1388,29 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, /* All fields must match except length and checksum. */ NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | - ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); + (iph->tos ^ iph2->tos) | + ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); + /* Save the IP ID check to be included later when we get to + * the transport layer so only the inner most IP ID is checked. + * This is because some GSO/TSO implementations do not + * correctly increment the IP ID for the outer hdrs. + */ + NAPI_GRO_CB(p)->flush_id = + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } NAPI_GRO_CB(skb)->flush |= flush; + skb_set_network_header(skb, off); + /* The above will be needed by the transport layer if there is one + * immediately following this IP hdr. + */ + skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); - pp = ops->gro_receive(head, skb); + pp = ops->callbacks.gro_receive(head, skb); out_unlock: rcu_read_unlock(); @@ -1396,23 +1421,30 @@ out: return pp; } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff) { - const struct net_protocol *ops; - struct iphdr *iph = ip_hdr(skb); - int proto = iph->protocol & (MAX_INET_PROTOS - 1); + __be16 newlen = htons(skb->len - nhoff); + struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); + const struct net_offload *ops; + int proto = iph->protocol; int err = -ENOSYS; - __be16 newlen = htons(skb->len - skb_network_offset(skb)); + + if (skb->encapsulation) + skb_set_inner_network_header(skb, nhoff); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; rcu_read_lock(); - ops = rcu_dereference(inet_protos[proto]); - if (WARN_ON(!ops || !ops->gro_complete)) + ops = rcu_dereference(inet_offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->gro_complete(skb); + /* Only need to add sizeof(*iph) to get to the next hdr below + * because any hdr with option will have been flushed in + * inet_gro_receive(). 
+ */ + err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); @@ -1442,22 +1474,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; - int i, j; + int i; - for_each_possible_cpu(i) { - for (j = 0; j < SNMP_ARRAY_SZ; j++) - res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); - } + for_each_possible_cpu(i) + res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; @@ -1468,12 +1498,12 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) u64 v; unsigned int start; - bhptr = per_cpu_ptr(mib[0], cpu); + bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { - start = u64_stats_fetch_begin_bh(syncp); + start = u64_stats_fetch_begin_irq(syncp); v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_bh(syncp, start)); + } while (u64_stats_fetch_retry_irq(syncp, start)); res += v; } @@ -1482,36 +1512,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ - BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, align); - if (!ptr[0]) - return -ENOMEM; -#if SNMP_ARRAY_SZ == 2 - ptr[1] = __alloc_percpu(mibsize, align); - if (!ptr[1]) { - free_percpu(ptr[0]); - ptr[0] = NULL; - return -ENOMEM; - } -#endif - return 0; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - -void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) -{ - int i; - - BUG_ON(ptr == NULL); - for (i = 0; i < SNMP_ARRAY_SZ; i++) { - free_percpu(ptr[i]); - ptr[i] = NULL; - } -} -EXPORT_SYMBOL_GPL(snmp_mib_free); - #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1520,57 +1520,57 @@ static const struct net_protocol igmp_protocol = { #endif static const struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - .no_policy = 1, - .netns_ok = 1, + .early_demux = tcp_v4_early_demux, + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .no_policy = 1, + .netns_ok = 1, + .icmp_strict_tag_validation = 1, }; static const struct net_protocol udp_protocol = { + .early_demux = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, - .gso_send_check = udp4_ufo_send_check, - .gso_segment = udp4_ufo_fragment, .no_policy = 1, .netns_ok = 1, }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, - .err_handler = ping_err, + .err_handler = icmp_err, .no_policy = 1, .netns_ok = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) { - if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) + int i; + + net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); + if (!net->mib.tcp_statistics) goto err_tcp_mib; - if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct 
ipstats_mib)) < 0) + net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ip_statistics) goto err_ip_mib; - if (snmp_mib_init((void __percpu **)net->mib.net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) + + for_each_possible_cpu(i) { + struct ipstats_mib *af_inet_stats; + af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); + u64_stats_init(&af_inet_stats->syncp); + } + + net->mib.net_statistics = alloc_percpu(struct linux_mib); + if (!net->mib.net_statistics) goto err_net_mib; - if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udp_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udp_statistics) goto err_udp_mib; - if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udplite_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_statistics) goto err_udplite_mib; - if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) + net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); + if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); @@ -1581,17 +1581,17 @@ static __net_init int ipv4_mib_init_net(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + free_percpu(net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + free_percpu(net->mib.udplite_statistics); err_udplite_mib: - snmp_mib_free((void __percpu **)net->mib.udp_statistics); + free_percpu(net->mib.udp_statistics); err_udp_mib: - snmp_mib_free((void __percpu **)net->mib.net_statistics); + free_percpu(net->mib.net_statistics); err_net_mib: - snmp_mib_free((void __percpu **)net->mib.ip_statistics); + free_percpu(net->mib.ip_statistics); err_ip_mib: - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } @@ -1599,12 +1599,12 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); - snmp_mib_free((void __percpu **)net->mib.udp_statistics); - snmp_mib_free((void __percpu **)net->mib.net_statistics); - snmp_mib_free((void __percpu **)net->mib.ip_statistics); - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.icmp_statistics); + free_percpu(net->mib.udplite_statistics); + free_percpu(net->mib.udp_statistics); + free_percpu(net->mib.net_statistics); + free_percpu(net->mib.ip_statistics); + free_percpu(net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1617,37 +1617,95 @@ static int __init init_ipv4_mibs(void) return register_pernet_subsys(&ipv4_mib_ops); } +static __net_init int inet_init_net(struct net *net) +{ + /* + * Set defaults for local port range + */ + seqlock_init(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = 32768; + net->ipv4.ip_local_ports.range[1] = 61000; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. 
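The ping_group_range defaults initialised just below gate unprivileged ICMP sockets: a process may create one only if one of its groups falls inside the configured range, and the inverted 1..0 default keeps the feature off until a boot script widens it. A small userspace sketch of the socket type this knob controls:

    #include <stdio.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
    	/* Allowed only when the caller's group lies within
    	 * /proc/sys/net/ipv4/ping_group_range; fails with the
    	 * default (disabled) range. */
    	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);

    	if (fd < 0) {
    		perror("ping socket");
    		return 1;
    	}
    	/* An ICMP echo request could now be sent with sendto(); the
    	 * kernel manages the echo identifier for the socket. */
    	close(fd);
    	return 0;
    }

With the default range the socket() call fails and a ping utility has to fall back to a raw ICMP socket, which needs CAP_NET_RAW.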
+ */ + net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); + net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); + return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { + .init = inet_init_net, + .exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ + return register_pernet_subsys(&af_inet_ops); +} + static int ipv4_proc_init(void); /* * IP protocol layer initialiser */ +static struct packet_offload ip_packet_offload __read_mostly = { + .type = cpu_to_be16(ETH_P_IP), + .callbacks = { + .gso_send_check = inet_gso_send_check, + .gso_segment = inet_gso_segment, + .gro_receive = inet_gro_receive, + .gro_complete = inet_gro_complete, + }, +}; + +static const struct net_offload ipip_offload = { + .callbacks = { + .gso_send_check = inet_gso_send_check, + .gso_segment = inet_gso_segment, + }, +}; + +static int __init ipv4_offload_init(void) +{ + /* + * Add offloads + */ + if (udpv4_offload_init() < 0) + pr_crit("%s: Cannot add UDP protocol offload\n", __func__); + if (tcpv4_offload_init() < 0) + pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + + dev_add_offload(&ip_packet_offload); + inet_add_offload(&ipip_offload, IPPROTO_IPIP); + return 0; +} + +fs_initcall(ipv4_offload_init); + static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, - .gso_send_check = inet_gso_send_check, - .gso_segment = inet_gso_segment, - .gro_receive = inet_gro_receive, - .gro_complete = inet_gro_complete, }; static int __init inet_init(void) { - struct sk_buff *dummy_skb; struct inet_protosw *q; struct list_head *r; int rc = -EINVAL; - BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); - - sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); - if (!sysctl_local_reserved_ports) - goto out; + BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); rc = proto_register(&tcp_prot, 1); if (rc) - goto out_free_reserved_ports; + goto out; rc = proto_register(&udp_prot, 1); if (rc) @@ -1671,8 +1729,6 @@ static int __init inet_init(void) ip_static_sysctl_init(); #endif - tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; - /* * Add all the base protocols. 
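The ipip_offload/inet_add_offload() registration just above is the pattern any IPv4 sub-protocol now uses to plug into GSO/GRO after the callbacks moved out of struct net_protocol. A minimal sketch of the same pattern; my_gso_segment and MY_IPPROTO are hypothetical placeholders, not symbols introduced by this patch:

    static struct sk_buff *my_gso_segment(struct sk_buff *skb,
    					  netdev_features_t features)
    {
    	/* Hypothetical: split one oversized skb into MTU-sized segments. */
    	return ERR_PTR(-EPROTONOSUPPORT);
    }

    static const struct net_offload my_offload = {
    	.callbacks = {
    		.gso_segment = my_gso_segment,
    	},
    };

    static int __init my_offload_init(void)
    {
    	/* Same call used for ipip_offload above; MY_IPPROTO would be
    	 * the IANA protocol number this handler claims. */
    	return inet_add_offload(&my_offload, MY_IPPROTO);
    }
    fs_initcall(my_offload_init);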
*/ @@ -1734,6 +1790,9 @@ static int __init inet_init(void) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif + + if (init_inet_pernet_ops()) + pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); /* * Initialise per-cpu ipv4 mibs */ @@ -1756,8 +1815,6 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); -out_free_reserved_ports: - kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e8f2617ecd4..a2afa89513a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; @@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } err = -ENOMEM; - iph = ah_alloc_tmp(ahash, nfrags, ihl); + iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; - - icv = ah_tmp_icv(ahash, iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; @@ -269,7 +283,11 @@ static void ah_input_done(struct crypto_async_request *base, int err) skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); - skb_set_transport_header(skb, -ihl); + + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -ihl); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); @@ -291,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; @@ -317,8 +339,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. 
*/ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; @@ -332,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) iph = ip_hdr(skb); ihl = ip_hdrlen(skb); - work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + + ahp->icv_trunc_len + seqhi_len); if (!work_iph) goto out; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -358,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; @@ -381,7 +415,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); - skb_set_transport_header(skb, -ihl); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -ihl); err = nexthdr; @@ -391,24 +428,35 @@ out: return err; } -static void ah4_err(struct sk_buff *skb, u32 info) +static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) - return; - pr_debug("pmtu discovery on SA AH/%08x/%08x\n", - ntohl(ah->spi), ntohl(iph->daddr)); + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); + + return 0; } static int ah_init_state(struct xfrm_state *x) @@ -490,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x) kfree(ahp); } +static int ah4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} static const struct xfrm_type ah_type = { @@ -503,11 +555,12 @@ static const struct xfrm_type ah_type = .output = ah_output }; -static const struct net_protocol ah4_protocol = { +static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init 
ah4_init(void) @@ -516,7 +569,7 @@ static int __init ah4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; @@ -526,7 +579,7 @@ static int __init ah4_init(void) static void __exit ah4_fini(void) { - if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cda37be02f8..1a9b99e0446 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -166,18 +166,20 @@ struct neigh_table arp_tbl = { .id = "arp_cache", .parms = { .tbl = &arp_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len_bytes = 64*1024, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, + .data = { + [NEIGH_VAR_MCAST_PROBES] = 3, + [NEIGH_VAR_UCAST_PROBES] = 3, + [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, + [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, + [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_GC_STALETIME] = 60 * HZ, + [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_PROXY_QLEN] = 64, + [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, + [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, + [NEIGH_VAR_LOCKTIME] = 1 * HZ, + }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, @@ -321,7 +323,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; - u8 *dst_ha = NULL; + u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); @@ -359,30 +361,27 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); - probes -= neigh->parms->ucast_probes; + probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(neigh->nud_state & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); - dst_ha = neigh->ha; - read_lock_bh(&neigh->lock); + neigh_ha_snapshot(dst_ha, neigh, dev); + dst_hw = dst_ha; } else { - probes -= neigh->parms->app_probes; + probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { -#ifdef CONFIG_ARPD neigh_app_ns(neigh); -#endif return; } } arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_ha, dev->dev_addr, NULL); - if (dst_ha) - read_unlock_bh(&neigh->lock); + dst_hw, dev->dev_addr, NULL); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { + struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { @@ -401,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; + in_dev = NULL; break; case 4: /* Reserved */ case 5: @@ -412,7 +412,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) default: return 0; } - return !inet_confirm_addr(in_dev, sip, tip, scope); + return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static 
int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) @@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) return 1; } - paddr = skb_rtable(skb)->rt_gateway; - + paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) return 0; @@ -657,11 +656,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; - if (target_hw != NULL) - memcpy(arp_ptr, target_hw, dev->addr_len); - else - memset(arp_ptr, 0, dev->addr_len); - arp_ptr += dev->addr_len; + + switch (dev->type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + break; +#endif + default: + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr += dev->addr_len; + } memcpy(arp_ptr, &dest_ip, 4); return skb; @@ -725,6 +732,7 @@ static int arp_process(struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. @@ -784,13 +792,21 @@ static int arp_process(struct sk_buff *skb) arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - arp_ptr += dev->addr_len; + switch (dev_type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + break; +#endif + default: + arp_ptr += dev->addr_len; + } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ - if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + if (ipv4_is_multicast(tip) || + (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out; /* @@ -860,7 +876,7 @@ static int arp_process(struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || - in_dev->arp_parms->proxy_delay == 0) { + NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha); @@ -883,10 +899,12 @@ static int arp_process(struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ + is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && + inet_addr_type(net, sip) == RTN_UNICAST; + if (n == NULL && - (arp->ar_op == htons(ARPOP_REPLY) || - (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && - inet_addr_type(net, sip) == RTN_UNICAST) + ((arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -899,7 +917,10 @@ static int arp_process(struct sk_buff *skb) agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ - override = time_after(jiffies, n->updated + n->parms->locktime); + override = time_after(jiffies, + n->updated + + NEIGH_VAR(n->parms, LOCKTIME)) || + is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. 
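The is_garp test added above classifies an ARP request whose sender and target IP addresses match as a gratuitous announcement and lets it override a cached entry without waiting out LOCKTIME. For reference, such a frame can be produced from userspace with a packet socket; a hedged sketch (interface name, MAC and address are illustrative, and CAP_NET_RAW is required):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <net/if.h>
    #include <net/if_arp.h>
    #include <linux/if_ether.h>
    #include <netinet/if_ether.h>	/* struct ether_arp */
    #include <netpacket/packet.h>	/* struct sockaddr_ll */
    #include <sys/socket.h>

    int main(void)
    {
    	int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ARP));
    	unsigned char mac[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };
    	struct in_addr ip;
    	struct ether_arp req;
    	struct sockaddr_ll dst = {
    		.sll_family   = AF_PACKET,
    		.sll_protocol = htons(ETH_P_ARP),
    		.sll_ifindex  = if_nametoindex("eth0"),
    		.sll_halen    = ETH_ALEN,
    		.sll_addr     = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    	};

    	inet_pton(AF_INET, "192.0.2.10", &ip);
    	req.arp_hrd = htons(ARPHRD_ETHER);
    	req.arp_pro = htons(ETH_P_IP);
    	req.arp_hln = ETH_ALEN;
    	req.arp_pln = 4;
    	req.arp_op  = htons(ARPOP_REQUEST);
    	memcpy(req.arp_sha, mac, ETH_ALEN);
    	memcpy(req.arp_spa, &ip, 4);
    	memset(req.arp_tha, 0, ETH_ALEN);
    	memcpy(req.arp_tpa, &ip, 4);	/* tip == sip: gratuitous */

    	if (fd < 0 || sendto(fd, &req, sizeof(req), 0,
    			     (struct sockaddr *)&dst, sizeof(dst)) < 0)
    		perror("garp");
    	close(fd);
    	return 0;
    }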
@@ -930,24 +951,25 @@ static void parp_redo(struct sk_buff *skb) static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct arphdr *arp; + const struct arphdr *arp; + + if (dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK) + goto freeskb; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto freeskb; arp = arp_hdr(skb); - if (arp->ar_hln != dev->addr_len || - dev->flags & IFF_NOARP || - skb->pkt_type == PACKET_OTHERHOST || - skb->pkt_type == PACKET_LOOPBACK || - arp->ar_pln != 4) + if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) goto freeskb; - skb = skb_share_check(skb, GFP_ATOMIC); - if (skb == NULL) - goto out_of_mem; - memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); @@ -1095,7 +1117,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -int arp_invalidate(struct net_device *dev, __be32 ip) +static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; @@ -1110,7 +1132,6 @@ int arp_invalidate(struct net_device *dev, __be32 ip) return err; } -EXPORT_SYMBOL(arp_invalidate); static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) @@ -1161,7 +1182,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); @@ -1220,12 +1241,18 @@ out: static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_change_info *change_info; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); + break; + case NETDEV_CHANGE: + change_info = ptr; + if (change_info->flags_changed & IFF_NOARP) + neigh_changeaddr(&arp_tbl, dev); break; default: break; @@ -1266,7 +1293,7 @@ void __init arp_init(void) dev_add_pack(&arp_packet_type); arp_proc_init(); #ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); + neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); } @@ -1406,14 +1433,14 @@ static const struct file_operations arp_seq_fops = { static int __net_init arp_net_init(struct net *net) { - if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops)) + if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops)) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { - proc_net_remove(net, "arp"); + remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c48adc565e9..69e77c8ff28 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -31,8 +31,7 @@ * the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ @@ -1336,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1432,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1527,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1725,8 +1721,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is - * not the loopback device drop the packet. */ - if (!(skb->dev->flags & IFF_LOOPBACK)) { + * not the loopback device drop the packet. Further, + * there is no legitimate reason for setting this from + * userspace so reject it if skb is NULL. */ + if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 424fafbc8cb..a3095fdefbe 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -53,11 +53,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, true); + inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) - IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -85,3 +85,38 @@ out: return err; } EXPORT_SYMBOL(ip4_datagram_connect); + +/* Because UDP xmit path can manipulate sk_dst_cache without holding + * socket lock, we need to use sk_dst_set() here, + * even if we own the socket lock. + */ +void ip4_datagram_release_cb(struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + struct dst_entry *dst; + struct flowi4 fl4; + struct rtable *rt; + + rcu_read_lock(); + + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { + rcu_read_unlock(); + return; + } + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_protocol, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + + dst = !IS_ERR(rt) ? 
&rt->dst : NULL; + sk_dst_set(sk, dst); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 10e15a144e9..e9449376b58 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -55,6 +55,7 @@ #include <linux/sysctl.h> #endif #include <linux/kmod.h> +#include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> @@ -62,6 +63,7 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/addrconf.h> #include "fib_lookup.h" @@ -71,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = { [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, + [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, }, }; @@ -81,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, + [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, }, }; @@ -92,38 +98,34 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, + [IFA_FLAGS] = { .type = NLA_U32 }, }; -/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE - * value. So if you change this define, make appropriate changes to - * inet_addr_hash as well. - */ -#define IN4_ADDR_HSIZE 256 +#define IN4_ADDR_HSIZE_SHIFT 8 +#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) + static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static DEFINE_SPINLOCK(inet_addr_hash_lock); -static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) +static u32 inet_addr_hash(struct net *net, __be32 addr) { - u32 val = (__force u32) addr ^ hash_ptr(net, 8); + u32 val = (__force u32) addr ^ net_hash_mix(net); - return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & - (IN4_ADDR_HSIZE - 1)); + return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { - unsigned int hash = inet_addr_hash(net, ifa->ifa_local); + u32 hash = inet_addr_hash(net, ifa->ifa_local); - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); - spin_unlock(&inet_addr_hash_lock); } static void inet_hash_remove(struct in_ifaddr *ifa) { - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_del_init_rcu(&ifa->hash); - spin_unlock(&inet_addr_hash_lock); } /** @@ -136,18 +138,17 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - unsigned int hash = inet_addr_hash(net, addr); + u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; - struct hlist_node *node; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { if (ifa->ifa_local == addr) { + struct net_device *dev = ifa->ifa_dev->dev; + + if (!net_eq(dev_net(dev), net)) + continue; result = dev; break; } @@ -182,10 +183,10 @@ static void 
inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, static void devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else -static inline void devinet_sysctl_register(struct in_device *idev) +static void devinet_sysctl_register(struct in_device *idev) { } -static inline void devinet_sysctl_unregister(struct in_device *idev) +static void devinet_sysctl_unregister(struct in_device *idev) { } #endif @@ -205,7 +206,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head) kfree(ifa); } -static inline void inet_free_ifa(struct in_ifaddr *ifa) +static void inet_free_ifa(struct in_ifaddr *ifa) { call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); } @@ -216,6 +217,7 @@ void in_dev_finish_destroy(struct in_device *idev) WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); + kfree(rcu_dereference_protected(idev->mc_hash, 1)); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif @@ -314,7 +316,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy, struct nlmsghdr *nlh, u32 pid) + int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1 = *ifap; @@ -348,7 +350,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; - rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); + rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); @@ -385,7 +387,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ - rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid); + rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { @@ -398,7 +400,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } promote->ifa_flags &= ~IFA_F_SECONDARY; - rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); + rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { @@ -419,8 +421,12 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } +static void check_lifetime(struct work_struct *work); + +static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); + static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 pid) + u32 portid) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -455,7 +461,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { - net_srandom(ifa->ifa_local); + prandom_seed((__force u32) ifa->ifa_local); ifap = last_primary; } @@ -464,10 +470,13 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); + cancel_delayed_work(&check_lifetime_work); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); + /* Send message first, then call notifier. 
Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ - rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid); + rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; @@ -489,6 +498,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) return -ENOBUFS; } ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); if (ifa->ifa_dev != in_dev) { WARN_ON(ifa->ifa_dev); in_dev_hold(in_dev); @@ -530,7 +540,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } -static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -566,7 +576,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) continue; - __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid); + __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -575,7 +585,132 @@ errout: return err; } -static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) +#define INFINITY_LIFE_TIME 0xFFFFFFFF + +static void check_lifetime(struct work_struct *work) +{ + unsigned long now, next, next_sec, next_sched; + struct in_ifaddr *ifa; + struct hlist_node *n; + int i; + + now = jiffies; + next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); + + for (i = 0; i < IN4_ADDR_HSIZE; i++) { + bool change_needed = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { + unsigned long age; + + if (ifa->ifa_flags & IFA_F_PERMANENT) + continue; + + /* We try to batch several events at once. */ + age = (now - ifa->ifa_tstamp + + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && + age >= ifa->ifa_valid_lft) { + change_needed = true; + } else if (ifa->ifa_preferred_lft == + INFINITY_LIFE_TIME) { + continue; + } else if (age >= ifa->ifa_preferred_lft) { + if (time_before(ifa->ifa_tstamp + + ifa->ifa_valid_lft * HZ, next)) + next = ifa->ifa_tstamp + + ifa->ifa_valid_lft * HZ; + + if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) + change_needed = true; + } else if (time_before(ifa->ifa_tstamp + + ifa->ifa_preferred_lft * HZ, + next)) { + next = ifa->ifa_tstamp + + ifa->ifa_preferred_lft * HZ; + } + } + rcu_read_unlock(); + if (!change_needed) + continue; + rtnl_lock(); + hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { + unsigned long age; + + if (ifa->ifa_flags & IFA_F_PERMANENT) + continue; + + /* We try to batch several events at once. */ + age = (now - ifa->ifa_tstamp + + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && + age >= ifa->ifa_valid_lft) { + struct in_ifaddr **ifap; + + for (ifap = &ifa->ifa_dev->ifa_list; + *ifap != NULL; ifap = &(*ifap)->ifa_next) { + if (*ifap == ifa) { + inet_del_ifa(ifa->ifa_dev, + ifap, 1); + break; + } + } + } else if (ifa->ifa_preferred_lft != + INFINITY_LIFE_TIME && + age >= ifa->ifa_preferred_lft && + !(ifa->ifa_flags & IFA_F_DEPRECATED)) { + ifa->ifa_flags |= IFA_F_DEPRECATED; + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); + } + } + rtnl_unlock(); + } + + next_sec = round_jiffies_up(next); + next_sched = next; + + /* If rounded timeout is accurate enough, accept it. 
*/ + if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) + next_sched = next_sec; + + now = jiffies; + /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ + if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) + next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; + + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, + next_sched - now); +} + +static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, + __u32 prefered_lft) +{ + unsigned long timeout; + + ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); + + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) + ifa->ifa_valid_lft = timeout; + else + ifa->ifa_flags |= IFA_F_PERMANENT; + + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa->ifa_flags |= IFA_F_DEPRECATED; + ifa->ifa_preferred_lft = timeout; + } + ifa->ifa_tstamp = jiffies; + if (!ifa->ifa_cstamp) + ifa->ifa_cstamp = ifa->ifa_tstamp; +} + +static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, + __u32 *pvalid_lft, __u32 *pprefered_lft) { struct nlattr *tb[IFA_MAX+1]; struct in_ifaddr *ifa; @@ -612,6 +747,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) goto errout; ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); in_dev_hold(in_dev); if (tb[IFA_ADDRESS] == NULL) @@ -620,7 +756,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); - ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : + ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; @@ -635,31 +772,87 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + if (tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci; + + ci = nla_data(tb[IFA_CACHEINFO]); + if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { + err = -EINVAL; + goto errout_free; + } + *pvalid_lft = ci->ifa_valid; + *pprefered_lft = ci->ifa_prefered; + } + return ifa; +errout_free: + inet_free_ifa(ifa); errout: return ERR_PTR(err); } -static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct in_ifaddr *ifa1, **ifap; + + if (!ifa->ifa_local) + return NULL; + + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; + ifap = &ifa1->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa) && + ifa1->ifa_local == ifa->ifa_local) + return ifa1; + } + return NULL; +} + +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa; + struct in_ifaddr *ifa_existing; + __u32 valid_lft = INFINITY_LIFE_TIME; + __u32 prefered_lft = INFINITY_LIFE_TIME; ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh); + ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); if (IS_ERR(ifa)) return PTR_ERR(ifa); - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid); + ifa_existing = find_matching_ifa(ifa); + if (!ifa_existing) { + /* It would be best to check for !NLM_F_CREATE here but + * userspace already relies on not having to provide this. 
+ */ + set_ifa_lifetime(ifa, valid_lft, prefered_lft); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + } else { + inet_free_ifa(ifa); + + if (nlh->nlmsg_flags & NLM_F_EXCL || + !(nlh->nlmsg_flags & NLM_F_REPLACE)) + return -EEXIST; + ifa = ifa_existing; + set_ifa_lifetime(ifa, valid_lft, prefered_lft); + cancel_delayed_work(&check_lifetime_work); + queue_delayed_work(system_power_efficient_wq, + &check_lifetime_work, 0); + rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); + blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + } + return 0; } /* * Determine a default network mask, based on the IP address. */ -static inline int inet_abc_len(__be32 addr) +static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ @@ -725,16 +918,16 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; case SIOCSIFFLAGS: - ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) @@ -825,9 +1018,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; ifa = inet_alloc_ifa(); - INIT_HLIST_NODE(&ifa->hash); if (!ifa) break; + INIT_HLIST_NODE(&ifa->hash); if (colon) memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); else @@ -854,6 +1047,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } + set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; @@ -939,10 +1133,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) if (len < (int) sizeof(ifr)) break; memset(&ifr, 0, sizeof(struct ifreq)); - if (ifa->ifa_label) - strcpy(ifr.ifr_name, ifa->ifa_label); - else - strcpy(ifr.ifr_name, dev->name); + strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = @@ -1048,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, /* * Confirm that local IP address exists using wildcards: - * - in_dev: only on this interface, 0=any interface + * - net: netns to check, cannot be NULL + * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ -__be32 inet_confirm_addr(struct in_device *in_dev, +__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; - struct net *net; - if (scope != RT_SCOPE_LINK) + if (in_dev != NULL) return confirm_addr_indev(in_dev, dst, local, scope); - net = dev_net(in_dev->dev); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); @@ -1124,7 +1314,7 @@ skip: } } -static inline bool inetdev_valid_mtu(unsigned int mtu) +static bool inetdev_valid_mtu(unsigned int mtu) { return mtu >= 68; } @@ -1149,7 +1339,7 @@ static void inetdev_send_gratuitous_arp(struct 
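
The devinet.c hunks above give IPv4 addresses the valid/preferred lifetime model IPv6 has long had: userspace supplies an IFA_CACHEINFO attribute with RTM_NEWADDR (the same attribute iproute2 sends for "ip addr add ... valid_lft N preferred_lft M"), and the new check_lifetime() worker later deprecates and finally deletes the address. The sketch below is a condensed, illustrative reading of that behaviour, not code from the patch; the function and enum names are invented for the example.

#include <linux/types.h>

#define EXAMPLE_INFINITY_LIFE_TIME 0xFFFFFFFFU

enum example_addr_state { ADDR_PREFERRED, ADDR_DEPRECATED, ADDR_EXPIRED };

/* What check_lifetime() effectively decides for an address that is
 * "age" seconds old, given the two lifetimes from IFA_CACHEINFO. */
static enum example_addr_state addr_state_after(__u32 age,
						__u32 valid_lft,
						__u32 prefered_lft)
{
	if (valid_lft != EXAMPLE_INFINITY_LIFE_TIME && age >= valid_lft)
		return ADDR_EXPIRED;	/* inet_del_ifa() removes the address */
	if (prefered_lft != EXAMPLE_INFINITY_LIFE_TIME && age >= prefered_lft)
		return ADDR_DEPRECATED;	/* IFA_F_DEPRECATED set, RTM_NEWADDR sent */
	return ADDR_PREFERRED;
}
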
net_device *dev, static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); @@ -1192,6 +1382,10 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, ifa->ifa_dev = in_dev; ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, + INFINITY_LIFE_TIME); + ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } @@ -1239,32 +1433,74 @@ static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; -static inline size_t inet_nlmsg_size(void) +static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ + + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ +} + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} + +static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, + unsigned long tstamp, u32 preferred, u32 valid) +{ + struct ifa_cacheinfo ci; + + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); + ci.ifa_prefered = preferred; + ci.ifa_valid = valid; + + return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; + u32 preferred, valid; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; - ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; + ifm->ifa_flags = ifa->ifa_flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { + preferred = ifa->ifa_preferred_lft; + valid = ifa->ifa_valid_lft; + if (preferred != INFINITY_LIFE_TIME) { + long tval = (jiffies - ifa->ifa_tstamp) / HZ; + + if (preferred > tval) + preferred -= tval; + else + preferred = 0; + if (valid != INFINITY_LIFE_TIME) { + if (valid > tval) + valid -= tval; + else + valid = 0; + } + } + } else { + preferred = INFINITY_LIFE_TIME; + valid = INFINITY_LIFE_TIME; + } if ((ifa->ifa_address && nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && @@ -1272,7 +1508,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, (ifa->ifa_broadcast && nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && - nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) + nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || + put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, + preferred, valid)) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -1292,7 +1531,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) struct in_device *in_dev; struct in_ifaddr *ifa; struct hlist_head *head; - struct hlist_node *node; s_h = 
cb->args[0]; s_idx = idx = cb->args[1]; @@ -1302,7 +1540,9 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; if (h > s_h || idx > s_idx) @@ -1316,12 +1556,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) if (ip_idx < s_ip_idx) continue; if (inet_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDR, NLM_F_MULTI) <= 0) { rcu_read_unlock(); goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } cont: idx++; @@ -1338,7 +1579,7 @@ done: } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 pid) + u32 portid) { struct sk_buff *skb; u32 seq = nlh ? nlh->nlmsg_seq : 0; @@ -1350,14 +1591,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, if (skb == NULL) goto errout; - err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0); + err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: if (err < 0) @@ -1445,6 +1686,232 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) return 0; } +static int inet_netconf_msgsize_devconf(int type) +{ + int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + + nla_total_size(4); /* NETCONFA_IFINDEX */ + + /* type -1 is used for ALL */ + if (type == -1 || type == NETCONFA_FORWARDING) + size += nla_total_size(4); + if (type == -1 || type == NETCONFA_RP_FILTER) + size += nla_total_size(4); + if (type == -1 || type == NETCONFA_MC_FORWARDING) + size += nla_total_size(4); + if (type == -1 || type == NETCONFA_PROXY_NEIGH) + size += nla_total_size(4); + + return size; +} + +static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, + struct ipv4_devconf *devconf, u32 portid, + u32 seq, int event, unsigned int flags, + int type) +{ + struct nlmsghdr *nlh; + struct netconfmsg *ncm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), + flags); + if (nlh == NULL) + return -EMSGSIZE; + + ncm = nlmsg_data(nlh); + ncm->ncm_family = AF_INET; + + if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) + goto nla_put_failure; + + /* type -1 is used for ALL */ + if ((type == -1 || type == NETCONFA_FORWARDING) && + nla_put_s32(skb, NETCONFA_FORWARDING, + IPV4_DEVCONF(*devconf, FORWARDING)) < 0) + goto nla_put_failure; + if ((type == -1 || type == NETCONFA_RP_FILTER) && + nla_put_s32(skb, NETCONFA_RP_FILTER, + IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) + goto nla_put_failure; + if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + nla_put_s32(skb, NETCONFA_MC_FORWARDING, + IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) + goto nla_put_failure; + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, + IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, + struct ipv4_devconf *devconf) +{ + 
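
The netconf netlink interface introduced in this file lets userspace read per-device IPv4 settings (forwarding, rp_filter, and so on) with RTM_GETNETCONF and listen on RTNLGRP_IPV4_NETCONF for RTM_NEWNETCONF change notifications, instead of polling /proc/sys/net/ipv4/conf/*. Below is a minimal userspace sketch of such a query over a plain NETLINK_ROUTE socket; it is illustrative only (the function name is invented, error handling and reply parsing are omitted).

#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/netconf.h>

/* Build and send an RTM_GETNETCONF request for one interface. */
static int query_ipv4_netconf(int fd, int ifindex)
{
	struct {
		struct nlmsghdr nlh;
		struct netconfmsg ncm;
		char attrbuf[32];	/* room for the aligned attribute */
	} req;
	struct rtattr *rta;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ncm));
	req.nlh.nlmsg_type = RTM_GETNETCONF;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.ncm.ncm_family = AF_INET;

	/* NETCONFA_IFINDEX may also be NETCONFA_IFINDEX_ALL or _DEFAULT,
	 * as handled by inet_netconf_get_devconf() in the hunks nearby. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = NETCONFA_IFINDEX;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	memcpy(RTA_DATA(rta), &ifindex, sizeof(int));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

	/* The kernel answers with RTM_NEWNETCONF carrying NETCONFA_FORWARDING,
	 * NETCONFA_RP_FILTER, etc. for the selected interface. */
	return send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}
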
struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, + RTM_NEWNETCONF, 0, type); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); +} + +static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { + [NETCONFA_IFINDEX] = { .len = sizeof(int) }, + [NETCONFA_FORWARDING] = { .len = sizeof(int) }, + [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, +}; + +static int inet_netconf_get_devconf(struct sk_buff *in_skb, + struct nlmsghdr *nlh) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NETCONFA_MAX+1]; + struct netconfmsg *ncm; + struct sk_buff *skb; + struct ipv4_devconf *devconf; + struct in_device *in_dev; + struct net_device *dev; + int ifindex; + int err; + + err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, + devconf_ipv4_policy); + if (err < 0) + goto errout; + + err = EINVAL; + if (!tb[NETCONFA_IFINDEX]) + goto errout; + + ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); + switch (ifindex) { + case NETCONFA_IFINDEX_ALL: + devconf = net->ipv4.devconf_all; + break; + case NETCONFA_IFINDEX_DEFAULT: + devconf = net->ipv4.devconf_dflt; + break; + default: + dev = __dev_get_by_index(net, ifindex); + if (dev == NULL) + goto errout; + in_dev = __in_dev_get_rtnl(dev); + if (in_dev == NULL) + goto errout; + devconf = &in_dev->cnf; + break; + } + + err = -ENOBUFS; + skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet_netconf_fill_devconf(skb, ifindex, devconf, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWNETCONF, 0, + -1); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: + return err; +} + +static int inet_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, s_idx; + struct net_device *dev; + struct in_device *in_dev; + struct hlist_head *head; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto cont; + + if (inet_netconf_fill_devconf(skb, dev->ifindex, + &in_dev->cnf, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + -1) <= 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } + if (h == NETDEV_HASHENTRIES) { + if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } + if (h == NETDEV_HASHENTRIES + 1) { + if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv4.devconf_dflt, + 
NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) @@ -1470,6 +1937,12 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; + inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all); + inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; @@ -1477,13 +1950,29 @@ static void inet_forward_change(struct net *net) dev_disable_lro(dev); rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); - if (in_dev) + if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); + inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + dev->ifindex, &in_dev->cnf); + } rcu_read_unlock(); } } -static int devinet_conf_proc(ctl_table *ctl, int write, +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ + if (cnf == net->ipv4.devconf_dflt) + return NETCONFA_IFINDEX_DEFAULT; + else if (cnf == net->ipv4.devconf_all) + return NETCONFA_IFINDEX_ALL; + else { + struct in_device *idev + = container_of(cnf, struct in_device, cnf); + return idev->dev->ifindex; + } +} + +static int devinet_conf_proc(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -1495,20 +1984,35 @@ static int devinet_conf_proc(ctl_table *ctl, int write, struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; + int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); - if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) + if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || + i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) - rt_cache_flush(net, 0); + rt_cache_flush(net); + + if (i == IPV4_DEVCONF_RP_FILTER - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, + ifindex, cnf); + } + if (i == IPV4_DEVCONF_PROXY_ARP - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + ifindex, cnf); + } } return ret; } -static int devinet_sysctl_forward(ctl_table *ctl, int write, +static int devinet_sysctl_forward(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -1529,21 +2033,29 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); - } else if (*valp) { + } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); - dev_disable_lro(idev->dev); + if (*valp) + dev_disable_lro(idev->dev); + inet_netconf_notify_devconf(net, + NETCONFA_FORWARDING, + idev->dev->ifindex, + cnf); } rtnl_unlock(); - rt_cache_flush(net, 0); - } + rt_cache_flush(net); + } else + inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv4.devconf_dflt); } return ret; } -static int ipv4_doint_and_flush(ctl_table *ctl, int write, +static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -1553,7 +2065,7 @@ static int ipv4_doint_and_flush(ctl_table 
*ctl, int write, struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(net, 0); + rt_cache_flush(net); return ret; } @@ -1610,13 +2122,19 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), + DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, + "force_igmp_version"), + DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, + "igmpv2_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, + "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), - DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, - "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), + DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, + "route_localnet"), }, }; @@ -1666,7 +2184,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) static void devinet_sysctl_register(struct in_device *idev) { - neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); + neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, &idev->cnf); } @@ -1804,10 +2322,14 @@ void __init devinet_init(void) register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); + rtnl_af_register(&inet_af_ops); rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); + rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, + inet_netconf_dump_devconf, NULL); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index cb982a61536..360b565918c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct aead_givcrypt_request *req; struct scatterlist *sg; struct scatterlist *asg; - struct esp_data *esp; struct sk_buff *trailer; void *tmp; u8 *iv; @@ -139,10 +138,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) /* skb is pure payload to encrypt */ - err = -ENOMEM; - - esp = x->data; - aead = esp->aead; + aead = x->data; alen = crypto_aead_authsize(aead); tfclen = 0; @@ -156,8 +152,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); - if (esp->padlen) - clen = ALIGN(clen, esp->padlen); plen = clen - skb->len - tfclen; err = skb_cow_data(skb, tfclen + plen + alen, &trailer); @@ -176,8 +170,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); - if (!tmp) + if (!tmp) { + err = -ENOMEM; goto error; + } seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); @@ -280,8 +276,7 @@ static int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int elen = skb->len - hlen; @@ -346,7 +341,10 @@ static int esp_input_done2(struct sk_buff 
*skb, int err) pskb_trim(skb, skb->len - alen - padlen - 2); __skb_pull(skb, hlen); - skb_set_transport_header(skb, -ihl); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -ihl); err = nexthdr[1]; @@ -373,8 +371,7 @@ static void esp_input_done(struct crypto_async_request *base, int err) static int esp_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); @@ -456,9 +453,8 @@ out: static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) { - struct esp_data *esp = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); - u32 align = max_t(u32, blksize, esp->padlen); + struct crypto_aead *aead = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); unsigned int net_adj; switch (x->props.mode) { @@ -473,44 +469,53 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) BUG(); } - return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - - net_adj) & ~(align - 1)) + (net_adj - 2); + return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - + net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp4_err(struct sk_buff *skb, u32 info) +static int esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) - return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr)); + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); + + return 0; } static void esp_destroy(struct xfrm_state *x) { - struct esp_data *esp = x->data; + struct crypto_aead *aead = x->data; - if (!esp) + if (!aead) return; - crypto_free_aead(esp->aead); - kfree(esp); + crypto_free_aead(aead); } static int esp_init_aead(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; int err; @@ -519,7 +524,7 @@ static int esp_init_aead(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; err = crypto_aead_setkey(aead, x->aead->alg_key, (x->aead->alg_key_len + 7) / 8); @@ -536,7 +541,6 @@ error: static int esp_init_authenc(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; struct crypto_authenc_key_param *param; struct rtattr *rta; @@ -571,7 +575,7 @@ static int esp_init_authenc(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; keylen = (x->aalg ? 
(x->aalg->alg_key_len + 7) / 8 : 0) + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -626,16 +630,11 @@ error: static int esp_init_state(struct xfrm_state *x) { - struct esp_data *esp; struct crypto_aead *aead; u32 align; int err; - esp = kzalloc(sizeof(*esp), GFP_KERNEL); - if (esp == NULL) - return -ENOMEM; - - x->data = esp; + x->data = NULL; if (x->aead) err = esp_init_aead(x); @@ -645,9 +644,7 @@ static int esp_init_state(struct xfrm_state *x) if (err) goto error; - aead = esp->aead; - - esp->padlen = 0; + aead = x->data; x->props.header_len = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -671,14 +668,17 @@ static int esp_init_state(struct xfrm_state *x) } align = ALIGN(crypto_aead_blocksize(aead), 4); - if (esp->padlen) - align = max_t(u32, align, esp->padlen); - x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); error: return err; } +static int esp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp_type = { .description = "ESP4", @@ -692,11 +692,12 @@ static const struct xfrm_type esp_type = .output = esp_output }; -static const struct net_protocol esp4_protocol = { +static struct xfrm4_protocol esp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = esp4_rcv_cb, .err_handler = esp4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init esp4_init(void) @@ -705,7 +706,7 @@ static int __init esp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp_type, AF_INET); return -EAGAIN; @@ -715,7 +716,7 @@ static int __init esp4_init(void) static void __exit esp4_fini(void) { - if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3854411fa37..255aa9946fe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -31,6 +31,7 @@ #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> +#include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> @@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id) tb = fib_trie_table(id); if (!tb) return NULL; + + switch (id) { + case RT_TABLE_LOCAL: + net->ipv4.fib_local = tb; + break; + + case RT_TABLE_MAIN: + net->ipv4.fib_main = tb; + break; + + case RT_TABLE_DEFAULT: + net->ipv4.fib_default = tb; + break; + + default: + break; + } + h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; @@ -93,7 +112,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id) struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; - struct hlist_node *node; struct hlist_head *head; unsigned int h; @@ -103,7 +121,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id) rcu_read_lock(); head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (tb->tb_id == id) { rcu_read_unlock(); return tb; @@ -118,18 +136,17 
@@ static void fib_flush(struct net *net) { int flushed = 0; struct fib_table *tb; - struct hlist_node *node; struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, node, head, tb_hlist) + hlist_for_each_entry(tb, head, tb_hlist) flushed += fib_table_flush(tb); } if (flushed) - rt_cache_flush(net, -1); + rt_cache_flush(net); } /* @@ -150,10 +167,6 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (ipv4_is_multicast(addr)) return RTN_MULTICAST; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; @@ -180,6 +193,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, } EXPORT_SYMBOL(inet_dev_addr_type); +__be32 fib_compute_spec_dst(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct rtable *rt; + struct flowi4 fl4; + struct net *net; + int scope; + + rt = skb_rtable(skb); + if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == + RTCF_LOCAL) + return ip_hdr(skb)->daddr; + + in_dev = __in_dev_get_rcu(dev); + BUG_ON(!in_dev); + + net = dev_net(dev); + + scope = RT_SCOPE_UNIVERSE; + if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { + fl4.flowi4_oif = 0; + fl4.flowi4_iif = LOOPBACK_IFINDEX; + fl4.daddr = ip_hdr(skb)->saddr; + fl4.saddr = 0; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_scope = scope; + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + if (!fib_lookup(net, &fl4, &res)) + return FIB_RES_PREFSRC(net, res); + } else { + scope = RT_SCOPE_LINK; + } + + return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); +} + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -188,39 +239,27 @@ EXPORT_SYMBOL(inet_dev_addr_type); * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, __be32 *spec_dst, - u32 *itag) +static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + int rpf, struct in_device *idev, u32 *itag) { - struct in_device *in_dev; - struct flowi4 fl4; + int ret, no_addr, accept_local; struct fib_result res; - int no_addr, rpf, accept_local; - bool dev_match; - int ret; + struct flowi4 fl4; struct net *net; + bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif; + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - no_addr = rpf = accept_local = 0; - in_dev = __in_dev_get_rcu(dev); - if (in_dev) { - no_addr = in_dev->ifa_list == NULL; - - /* Ignore rp_filter for packets protected by IPsec. */ - rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); - - accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); - fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - } + no_addr = idev->ifa_list == NULL; - if (in_dev == NULL) - goto e_inval; + accept_local = IN_DEV_ACCEPT_LOCAL(idev); + fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? 
skb->mark : 0; net = dev_net(dev); if (fib_lookup(net, &fl4, &res)) @@ -229,7 +268,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -258,17 +296,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { - if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(net, res); + if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; - } } return ret; last_resort: if (rpf) goto e_rpf; - *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; @@ -278,6 +313,21 @@ e_rpf: return -EXDEV; } +/* Ignore rp_filter for packets protected by IPsec. */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + + if (!r && !fib_num_tclassid_users(dev_net(dev)) && + (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + *itag = 0; + return 0; + } + return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); +} + static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; @@ -436,7 +486,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&rt, arg, sizeof(rt))) @@ -506,7 +556,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; - cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; @@ -554,7 +604,7 @@ errout: return err; } -static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -576,7 +626,7 @@ errout: return err; } -static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -604,13 +654,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; - struct hlist_node *node; struct hlist_head *head; int dumped = 0; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) - return ip_rt_dump(skb, cb); + return skb->len; s_h = cb->args[0]; s_e = cb->args[1]; @@ -618,7 +667,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, node, head, tb_hlist) { + hlist_for_each_entry(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -879,16 +928,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) .flowi4_scope = frn->fl_scope, }; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = 
tb->tb_id; - rcu_read_lock(); frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { @@ -897,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) frn->type = res.type; frn->scope = res.scope; } - rcu_read_unlock(); local_bh_enable(); } } @@ -908,35 +951,38 @@ static void nl_fib_input(struct sk_buff *skb) struct fib_result_nl *frn; struct nlmsghdr *nlh; struct fib_table *tb; - u32 pid; + u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); - if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || - nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) + if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(*frn)) return; - skb = skb_clone(skb, GFP_KERNEL); + skb = netlink_skb_clone(skb, GFP_KERNEL); if (skb == NULL) return; nlh = nlmsg_hdr(skb); - frn = (struct fib_result_nl *) NLMSG_DATA(nlh); + frn = (struct fib_result_nl *) nlmsg_data(nlh); tb = fib_get_table(net, frn->tb_id_in); nl_fib_lookup(frn, tb); - pid = NETLINK_CB(skb).pid; /* pid of sending process */ - NETLINK_CB(skb).pid = 0; /* from kernel */ + portid = NETLINK_CB(skb).portid; /* netlink portid */ + NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); + netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = nl_fib_input, + }; + + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (sk == NULL) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; @@ -949,11 +995,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force, int delay) +static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), delay); + rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } @@ -970,7 +1016,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -979,9 +1025,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. 
*/ - fib_disable_ip(dev, 1, 0); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); } break; } @@ -990,15 +1036,17 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct in_device *in_dev; struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2, -1); + fib_disable_ip(dev, 2); + rt_flush_dev(dev); return NOTIFY_DONE; } + in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; @@ -1011,21 +1059,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0, 0); + fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(dev_net(dev), 0); - break; - case NETDEV_UNREGISTER_BATCH: - /* The batch unregister is only called on the first - * device in the list of devices being unregistered. - * Therefore we should not pass dev_net(dev) in here. - */ - rt_cache_flush_batch(NULL); + rt_cache_flush(net); break; } return NOTIFY_DONE; @@ -1073,11 +1114,11 @@ static void ip_fib_net_exit(struct net *net) for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; head = &net->ipv4.fib_table_hash[i]; - hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { - hlist_del(node); + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { + hlist_del(&tb->tb_hlist); fib_table_flush(tb); fib_free_table(tb); } @@ -1090,6 +1131,9 @@ static int __net_init fib_net_init(struct net *net) { int error; +#ifdef CONFIG_IP_ROUTE_CLASSID + net->ipv4.fib_num_tclassid_users = 0; +#endif error = ip_fib_net_init(net); if (error < 0) goto out; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index af0f14aba16..1e4f6600b31 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -24,21 +24,15 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -extern void fib_release_info(struct fib_info *); -extern struct fib_info *fib_create_info(struct fib_config *cfg); -extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); -extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, __be32 dst, - int dst_len, u8 tos, struct fib_info *fi, - unsigned int); -extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, struct nl_info *info, - unsigned int nlm_flags); -extern struct fib_alias *fib_find_alias(struct list_head *fah, - u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, - int *last_idx, int dflt); +void fib_release_info(struct fib_info *); +struct fib_info *fib_create_info(struct fib_config *cfg); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, + u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, + unsigned int); +void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, + u32 tb_id, const struct nl_info *info, unsigned int 
nlm_flags); +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2d043f71ef7..f2e15738534 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,14 +47,7 @@ struct fib4_rule { #endif }; -#ifdef CONFIG_IP_ROUTE_CLASSID -u32 fib_rules_tclass(const struct fib_result *res) -{ - return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; -} -#endif - -int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -63,11 +56,15 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) int err; err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); - res->r = arg.rule; - +#ifdef CONFIG_IP_ROUTE_CLASSID + if (arg.rule) + res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; + else + res->tclassid = 0; +#endif return err; } -EXPORT_SYMBOL_GPL(fib_lookup); +EXPORT_SYMBOL_GPL(__fib_lookup); static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) @@ -104,6 +101,33 @@ errout: return err; } +static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct fib_result *result = (struct fib_result *) arg->result; + struct net_device *dev = NULL; + + if (result->fi) + dev = result->fi->fib_dev; + + /* do not accept result if the route does + * not meet the required prefix length + */ + if (result->prefixlen <= rule->suppress_prefixlen) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: + if (!(arg->flags & FIB_LOOKUP_NOREF)) + fib_info_put(result->fi); + return true; +} static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -169,8 +193,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dst = nla_get_be32(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID - if (tb[FRA_FLOW]) + if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); + if (rule4->tclassid) + net->ipv4.fib_num_tclassid_users++; + } #endif rule4->src_len = frh->src_len; @@ -179,11 +206,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dstmask = inet_make_mask(rule4->dst_len); rule4->tos = frh->tos; + net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; } +static void fib4_rule_delete(struct fib_rule *rule) +{ + struct net *net = rule->fr_net; +#ifdef CONFIG_IP_ROUTE_CLASSID + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (rule4->tclassid) + net->ipv4.fib_num_tclassid_users--; +#endif + net->ipv4.fib_has_custom_rules = true; +} + static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -246,16 +286,18 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(ops->fro_net, -1); + rt_cache_flush(ops->fro_net); } -static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), 
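
fib4_rule_suppress() above wires IPv4 policy rules into the new suppressor hook: a rule can declare that an otherwise matching route must be ignored, either because its prefix is too short (suppress_prefixlen) or because its nexthop device belongs to a given interface group (suppress_ifgroup); iproute2 exposes this as "ip rule add ... suppress_prefixlength N" and "... suppress_ifgroup G". Condensed to its core, and ignoring the NULL-device case the real code handles, the decision is roughly the following sketch (function name invented for illustration):

#include <stdbool.h>

static bool route_is_suppressed(int route_prefixlen, int suppress_prefixlen,
				int dev_group, int suppress_ifgroup)
{
	/* Reject routes that are less specific than the threshold ... */
	if (route_prefixlen <= suppress_prefixlen)
		return true;
	/* ... or whose egress device sits in the suppressed ifgroup. */
	return suppress_ifgroup != -1 && dev_group == suppress_ifgroup;
}

A common use is "ip rule add table main suppress_prefixlength 0", which ignores only the default route in the main table so that a VPN's more specific routes in another table can take over.
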
.addr_size = sizeof(u32), .action = fib4_rule_action, + .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, + .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .default_pref = fib_default_rule_pref, @@ -295,6 +337,7 @@ int __net_init fib4_rules_init(struct net *net) if (err < 0) goto fail; net->ipv4.rules_ops = ops; + net->ipv4.fib_has_custom_rules = false; return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e5b7182fa09..b10cd43a472 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,6 +140,63 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; +static void rt_fibinfo_free(struct rtable __rcu **rtp) +{ + struct rtable *rt = rcu_dereference_protected(*rtp, 1); + + if (!rt) + return; + + /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); + * because we waited an RCU grace period before calling + * free_fib_info_rcu() + */ + + dst_free(&rt->dst); +} + +static void free_nh_exceptions(struct fib_nh *nh) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + fnhe = rcu_dereference_protected(hash[i].chain, 1); + while (fnhe) { + struct fib_nh_exception *next; + + next = rcu_dereference_protected(fnhe->fnhe_next, 1); + + rt_fibinfo_free(&fnhe->fnhe_rth_input); + rt_fibinfo_free(&fnhe->fnhe_rth_output); + + kfree(fnhe); + + fnhe = next; + } + } + kfree(hash); +} + +static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) +{ + int cpu; + + if (!rtp) + return; + + for_each_possible_cpu(cpu) { + struct rtable *rt; + + rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); + if (rt) + dst_free(&rt->dst); + } + free_percpu(rtp); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -148,6 +205,10 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + if (nexthop_nh->nh_exceptions) + free_nh_exceptions(nexthop_nh); + rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); + rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); @@ -163,6 +224,12 @@ void free_fib_info(struct fib_info *fi) return; } fib_info_cnt--; +#ifdef CONFIG_IP_ROUTE_CLASSID + change_nexthops(fi) { + if (nexthop_nh->nh_tclassid) + fi->fib_net->ipv4.fib_num_tclassid_users--; + } endfor_nexthops(fi); +#endif call_rcu(&fi->rcu, free_fib_info_rcu); } @@ -232,14 +299,13 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) static struct fib_info *fib_find_info(const struct fib_info *nfi) { struct hlist_head *head; - struct hlist_node *node; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn(nfi); head = &fib_info_hash[hash]; - hlist_for_each_entry(fi, node, head, fib_hash) { + hlist_for_each_entry(fi, head, fib_hash) { if (!net_eq(fi->fib_net, nfi->fib_net)) continue; if (fi->fib_nhs != nfi->fib_nhs) @@ -248,6 +314,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && + nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && @@ -264,7 +331,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct 
hlist_head *head; - struct hlist_node *node; struct fib_nh *nh; unsigned int hash; @@ -272,7 +338,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; - hlist_for_each_entry(nh, node, head, nh_hash) { + hlist_for_each_entry(nh, head, nh_hash) { if (nh->nh_dev == dev && nh->nh_gw == gw && !(nh->nh_flags & RTNH_F_DEAD)) { @@ -314,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, struct nl_info *info, + int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { struct sk_buff *skb; @@ -325,7 +391,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, if (skb == NULL) goto errout; - err = fib_dump_info(skb, info->pid, seq, event, tb_id, + err = fib_dump_info(skb, info->portid, seq, event, tb_id, fa->fa_type, key, dst_len, fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { @@ -334,7 +400,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); return; errout: @@ -360,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt) +static int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, + int dflt) { struct neighbour *n; int state = NUD_NONE; @@ -421,6 +488,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? 
nla_get_u32(nla) : 0; + if (nexthop_nh->nh_tclassid) + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif } @@ -562,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, .flowi4_oif = nh->nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ @@ -652,10 +722,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; - struct hlist_node *node, *n; + struct hlist_node *n; struct fib_info *fi; - hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { + hlist_for_each_entry_safe(fi, n, head, fib_hash) { struct hlist_head *dest; unsigned int new_hash; @@ -670,10 +740,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, for (i = 0; i < old_size; i++) { struct hlist_head *lhead = &fib_info_laddrhash[i]; - struct hlist_node *node, *n; + struct hlist_node *n; struct fib_info *fi; - hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { struct hlist_head *ldest; unsigned int new_hash; @@ -734,7 +804,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) unsigned int bytes; if (!new_size) - new_size = 1; + new_size = 16; bytes = new_size * sizeof(struct hlist_head *); new_info_hash = fib_info_hash_alloc(bytes); new_laddrhash = fib_info_hash_alloc(bytes); @@ -751,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); if (!fi->fib_metrics) goto failure; } else fi->fib_metrics = (u32 *) dst_default_metrics; - fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; @@ -765,10 +835,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; + fi->fib_type = cfg->fc_type; fi->fib_nhs = nhs; change_nexthops(fi) { nexthop_nh->nh_parent = fi; + nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); + if (!nexthop_nh->nh_pcpu_rth_output) + goto failure; } endfor_nexthops(fi) if (cfg->fc_mx) { @@ -779,9 +853,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int type = nla_type(nla); if (type) { + u32 val; + if (type > RTAX_MAX) goto err_inval; - fi->fib_metrics[type - 1] = nla_get_u32(nla); + val = nla_get_u32(nla); + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + fi->fib_metrics[type - 1] = val; } } } @@ -810,6 +891,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; @@ -911,14 +994,14 @@ failure: return ERR_PTR(err); } -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, +int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (nlh == NULL) 
return -EMSGSIZE; @@ -1014,13 +1097,12 @@ int fib_sync_down_addr(struct net *net, __be32 local) int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; - struct hlist_node *node; struct fib_info *fi; if (fib_info_laddrhash == NULL || local == 0) return 0; - hlist_for_each_entry(fi, node, head, fib_lhash) { + hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net)) continue; if (fi->fib_prefsrc == local) { @@ -1038,13 +1120,12 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct fib_info *prev_fi = NULL; unsigned int hash = fib_devindex_hashfn(dev->ifindex); struct hlist_head *head = &fib_info_devhash[hash]; - struct hlist_node *node; struct fib_nh *nh; if (force) scope = -1; - hlist_for_each_entry(nh, node, head, nh_hash) { + hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int dead; @@ -1150,7 +1231,6 @@ int fib_sync_up(struct net_device *dev) struct fib_info *prev_fi; unsigned int hash; struct hlist_head *head; - struct hlist_node *node; struct fib_nh *nh; int ret; @@ -1162,7 +1242,7 @@ int fib_sync_up(struct net_device *dev) head = &fib_info_devhash[hash]; ret = 0; - hlist_for_each_entry(nh, node, head, nh_hash) { + hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int alive; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 30b88d7b4bd..5afeb5aa4c7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -71,7 +71,6 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/prefetch.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> @@ -125,7 +124,6 @@ struct tnode { unsigned int empty_children; /* KEYLENGTH bits needed */ union { struct rcu_head rcu; - struct work_struct work; struct tnode *tnode_free; }; struct rt_trie_node __rcu *child[0]; @@ -159,7 +157,6 @@ struct trie { #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); @@ -368,7 +365,7 @@ static void __leaf_free_rcu(struct rcu_head *head) static inline void free_leaf(struct leaf *l) { - call_rcu_bh(&l->rcu, __leaf_free_rcu); + call_rcu(&l->rcu, __leaf_free_rcu); } static inline void free_leaf_info(struct leaf_info *leaf) @@ -384,12 +381,6 @@ static struct tnode *tnode_alloc(size_t size) return vzalloc(size); } -static void __tnode_vfree(struct work_struct *arg) -{ - struct tnode *tn = container_of(arg, struct tnode, work); - vfree(tn); -} - static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); @@ -398,10 +389,8 @@ static void __tnode_free_rcu(struct rcu_head *head) if (size <= PAGE_SIZE) kfree(tn); - else { - INIT_WORK(&tn->work, __tnode_vfree); - schedule_work(&tn->work); - } + else + vfree(tn); } static inline void tnode_free(struct tnode *tn) @@ -473,7 +462,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct rt_trie_node) << bits); + sizeof(struct rt_trie_node *) << bits); return tn; } @@ -490,7 +479,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node * return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, +static inline void put_child(struct 
tnode *tn, int i, struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); @@ -754,8 +743,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct rt_trie_node *) left); - put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); + put_child(tn, 2*i, (struct rt_trie_node *) left); + put_child(tn, 2*i+1, (struct rt_trie_node *) right); } } @@ -773,12 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if (tkey_extract_bits(node->key, - oldtnode->pos + oldtnode->bits, - 1) == 0) - put_child(t, tn, 2*i, node); - else - put_child(t, tn, 2*i+1, node); + put_child(tn, + tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), + node); continue; } @@ -786,8 +772,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); - put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); + put_child(tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -817,22 +803,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) */ left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + put_child(tn, 2*i, NULL); BUG_ON(!left); right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + put_child(tn, 2*i+1, NULL); BUG_ON(!right); size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, rtnl_dereference(inode->child[j])); - put_child(t, right, j, rtnl_dereference(inode->child[j + size])); + put_child(left, j, rtnl_dereference(inode->child[j])); + put_child(right, j, rtnl_dereference(inode->child[j + size])); } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + put_child(tn, 2*i, resize(t, left)); + put_child(tn, 2*i+1, resize(t, right)); tnode_free_safe(inode); } @@ -877,7 +863,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct rt_trie_node *)newn); + put_child(tn, i/2, (struct rt_trie_node *)newn); } } @@ -892,21 +878,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (left == NULL) { if (right == NULL) /* Both are empty */ continue; - put_child(t, tn, i/2, right); + put_child(tn, i/2, right); continue; } if (right == NULL) { - put_child(t, tn, i/2, left); + put_child(tn, i/2, left); continue; } /* Two nonempty children */ newBinNode = (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); + put_child(tn, i/2, NULL); + put_child(newBinNode, 0, left); + put_child(newBinNode, 1, right); + put_child(tn, i/2, resize(t, newBinNode)); } tnode_free_safe(oldtnode); return tn; @@ -921,10 +907,9 @@ nomem: static struct leaf_info *find_leaf_info(struct leaf *l, int plen) { struct hlist_head *head = &l->list; - struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry_rcu(li, node, head, hlist) + hlist_for_each_entry_rcu(li, head, hlist) if (li->plen == plen) return li; @@ -944,12 +929,11 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen) static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { struct leaf_info *li = NULL, *last = NULL; 
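
The inflate() rewrite above collapses the old two-way branch on the next key bit into one put_child() call indexed by tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1). That works because extracting one extra bit yields 2*i when the extra bit is clear and 2*i + 1 when it is set. A minimal user-space sketch of that identity, with a local re-implementation of the extraction helper (illustrative only, not the kernel symbol):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KEYLENGTH 32

/* MSB-first extraction of 'bits' bits starting at 'pos' of a 32-bit key. */
static uint32_t extract_bits(uint32_t key, unsigned int pos, unsigned int bits)
{
	if (bits == 0 || pos >= KEYLENGTH)
		return 0;
	return (key << pos) >> (KEYLENGTH - bits);
}

int main(void)
{
	uint32_t key = 0xc0a80a01;	/* arbitrary test key (192.168.10.1) */
	unsigned int pos = 8, bits = 3;

	uint32_t i = extract_bits(key, pos, bits);		/* old child index */
	uint32_t next = extract_bits(key, pos + bits, 1);	/* bit the old code branched on */
	uint32_t wide = extract_bits(key, pos, bits + 1);	/* new one-step extraction */

	assert(wide == 2 * i + next);
	printf("i=%u next=%u wide=%u\n", i, next, wide);
	return 0;
}
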
- struct hlist_node *node; if (hlist_empty(head)) { hlist_add_head_rcu(&new->hlist, head); } else { - hlist_for_each_entry(li, node, head, hlist) { + hlist_for_each_entry(li, head, hlist) { if (new->plen > li->plen) break; @@ -1007,9 +991,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = (struct tnode *) resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, tn); - tnode_put_child_reorg((struct tnode *)tp, cindex, + tnode_put_child_reorg(tp, cindex, (struct rt_trie_node *)tn, wasfull); tp = node_parent((struct rt_trie_node *) tn); @@ -1024,7 +1008,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = (struct tnode *)resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, tn); rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); @@ -1125,7 +1109,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); + put_child(tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1133,12 +1117,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) * first tnode need some special handling */ - if (tp) - pos = tp->pos+tp->bits; - else - pos = 0; - if (n) { + pos = tp ? tp->pos+tp->bits : 0; newpos = tkey_mismatch(key, pos, n->key); tn = tnode_new(n->key, newpos, 1); } else { @@ -1155,13 +1135,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct rt_trie_node *)l); - put_child(t, tn, 1-missbit, n); + put_child(tn, missbit, (struct rt_trie_node *)l); + put_child(tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, - (struct rt_trie_node *)tn); + put_child(tp, cindex, (struct rt_trie_node *)tn); } else { rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; @@ -1288,7 +1267,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1335,7 +1314,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1356,9 +1335,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, { struct leaf_info *li; struct hlist_head *hhead = &l->list; - struct hlist_node *node; - hlist_for_each_entry_rcu(li, node, hhead, hlist) { + hlist_for_each_entry_rcu(li, hhead, hlist) { struct fib_alias *fa; if (l->key != (key & li->mask_plen)) @@ -1552,7 +1530,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, * state.directly. 
*/ if (pref_mismatch) { - int mp = KEYLENGTH - fls(pref_mismatch); + /* fls(x) = __fls(x) + 1 */ + int mp = KEYLENGTH - __fls(pref_mismatch) - 1; if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) goto backtrace; @@ -1620,7 +1599,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) if (tp) { t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, NULL); + put_child(tp, cindex, NULL); trie_rebalance(t, tp); } else RCU_INIT_POINTER(t->trie, NULL); @@ -1657,7 +1636,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!l) return -ESRCH; - fa_head = get_fa_head(l, plen); + li = find_leaf_info(l, plen); + + if (!li) + return -ESRCH; + + fa_head = &li->falh; fa = fib_find_alias(fa_head, tos, 0); if (!fa) @@ -1693,9 +1677,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); - l = fib_find_node(t, key); - li = find_leaf_info(l, plen); - list_del_rcu(&fa->fa_list); if (!plen) @@ -1710,7 +1691,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1739,10 +1720,10 @@ static int trie_flush_leaf(struct leaf *l) { int found = 0; struct hlist_head *lih = &l->list; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; struct leaf_info *li = NULL; - hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { + hlist_for_each_entry_safe(li, tmp, lih, hlist) { found += trie_flush_list(&li->falh); if (list_empty(&li->falh)) { @@ -1772,10 +1753,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) if (!c) continue; - if (IS_LEAF(c)) { - prefetch(rcu_dereference_rtnl(p->child[idx])); + if (IS_LEAF(c)) return (struct leaf *) c; - } /* Rescan start scanning in new node */ p = (struct tnode *) c; @@ -1872,7 +1851,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, continue; } - if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, tb->tb_id, @@ -1894,14 +1873,13 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { struct leaf_info *li; - struct hlist_node *node; int i, s_i; s_i = cb->args[4]; i = 0; /* rcu_read_lock is hold by caller */ - hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &l->list, hlist) { if (i < s_i) { i++; continue; @@ -2091,14 +2069,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) if (IS_LEAF(n)) { struct leaf *l = (struct leaf *)n; struct leaf_info *li; - struct hlist_node *tmp; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; - hlist_for_each_entry_rcu(li, tmp, &l->list, hlist) + hlist_for_each_entry_rcu(li, &l->list, hlist) ++s->prefixes; } else { const struct tnode *tn = (const struct tnode *) n; @@ -2146,7 +2123,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) max--; pointers = 0; - for (i = 1; i <= max; i++) + for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; @@ -2199,10 +2176,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) for (h = 0; h < 
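
The lookup change above swaps fls() for __fls() when computing the mismatch depth; since fls() reports the highest set bit 1-based and __fls() reports it 0-based, KEYLENGTH - fls(x) and KEYLENGTH - __fls(x) - 1 are the same value for any non-zero x. A small check of that equivalence using compiler builtins as stand-ins for the kernel helpers:

#include <assert.h>
#include <stdint.h>

#define KEYLENGTH 32

static int fls_1based(uint32_t x) { return x ? 32 - __builtin_clz(x) : 0; }
static int fls_0based(uint32_t x) { return 31 - __builtin_clz(x); }	/* caller guarantees x != 0 */

int main(void)
{
	uint32_t samples[] = { 1, 0x80, 0x00010000, 0xdeadbeef, 0x80000000 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint32_t x = samples[i];

		assert(fls_1based(x) == fls_0based(x) + 1);
		assert(KEYLENGTH - fls_1based(x) == KEYLENGTH - fls_0based(x) - 1);
	}
	return 0;
}
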
FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; - struct hlist_node *node; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; @@ -2244,10 +2220,9 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; - struct hlist_node *node; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct rt_trie_node *n; for (n = fib_trie_get_first(iter, @@ -2297,7 +2272,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; @@ -2380,13 +2355,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) } else { struct leaf *l = (struct leaf *) n; struct leaf_info *li; - struct hlist_node *node; __be32 val = htonl(l->key); seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); - hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &l->list, hlist) { struct fib_alias *fa; list_for_each_entry_rcu(fa, &li->falh, fa_list) { @@ -2531,7 +2505,6 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct leaf *l = v; struct leaf_info *li; - struct hlist_node *node; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2540,7 +2513,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &l->list, hlist) { struct fib_alias *fa; __be32 mask, prefix; @@ -2550,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); - int len; if (fa->fa_type == RTN_BROADCAST || fa->fa_type == RTN_MULTICAST) continue; + seq_setwidth(seq, 127); + if (fi) seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u%n", + "%d\t%08X\t%d\t%u\t%u", fi->fib_dev ? fi->fib_dev->name : "*", prefix, fi->fib_nh->nh_gw, flags, 0, 0, @@ -2568,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) (fi->fib_advmss ? 
fi->fib_advmss + 40 : 0), fi->fib_window, - fi->fib_rtt >> 3, &len); + fi->fib_rtt >> 3); else seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u%n", + "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, - mask, 0, 0, 0, &len); + mask, 0, 0, 0); - seq_printf(seq, "%*s\n", 127 - len, ""); + seq_pad(seq, '\n'); } } @@ -2606,31 +2580,31 @@ static const struct file_operations fib_route_fops = { int __net_init fib_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) + if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops)) goto out1; - if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, - &fib_triestat_fops)) + if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, + &fib_triestat_fops)) goto out2; - if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) + if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops)) goto out3; return 0; out3: - proc_net_remove(net, "fib_triestat"); + remove_proc_entry("fib_triestat", net->proc_net); out2: - proc_net_remove(net, "fib_trie"); + remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { - proc_net_remove(net, "fib_trie"); - proc_net_remove(net, "fib_triestat"); - proc_net_remove(net, "route"); + remove_proc_entry("fib_trie", net->proc_net); + remove_proc_entry("fib_triestat", net->proc_net); + remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c deleted file mode 100644 index 42a491055c7..00000000000 --- a/net/ipv4/gre.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * GRE over IPv4 demultiplexer driver - * - * Authors: Dmitry Kozlov (xeb@mail.ru) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
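
The /proc/net/route hunk above drops the "%n"-based padding and uses seq_setwidth()/seq_pad() to emit each record as a fixed 127-character field. A rough user-space equivalent of that padding step (the function name and field width mirror the hunk, everything else is illustrative):

#include <stdio.h>
#include <string.h>

#define ROUTE_LINE_WIDTH 127

/* Print one record padded with spaces to a fixed width, then '\n',
 * roughly what seq_setwidth() followed by seq_pad() produces. */
static void fib_route_line(FILE *out, const char *ifname,
			   unsigned int dst, unsigned int gw, unsigned int flags)
{
	char buf[ROUTE_LINE_WIDTH + 1];
	int len;

	len = snprintf(buf, sizeof(buf), "%s\t%08X\t%08X\t%04X", ifname, dst, gw, flags);
	if (len < 0 || len > ROUTE_LINE_WIDTH)
		len = ROUTE_LINE_WIDTH;

	memset(buf + len, ' ', ROUTE_LINE_WIDTH - len);
	buf[ROUTE_LINE_WIDTH] = '\0';
	fprintf(out, "%s\n", buf);
}

int main(void)
{
	fib_route_line(stdout, "eth0", 0x0000A8C0, 0x0100A8C0, 0x0003);
	return 0;
}
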
- * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/netdevice.h> -#include <linux/spinlock.h> -#include <net/protocol.h> -#include <net/gre.h> - - -static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; -static DEFINE_SPINLOCK(gre_proto_lock); - -int gre_add_protocol(const struct gre_protocol *proto, u8 version) -{ - if (version >= GREPROTO_MAX) - goto err_out; - - spin_lock(&gre_proto_lock); - if (gre_proto[version]) - goto err_out_unlock; - - RCU_INIT_POINTER(gre_proto[version], proto); - spin_unlock(&gre_proto_lock); - return 0; - -err_out_unlock: - spin_unlock(&gre_proto_lock); -err_out: - return -1; -} -EXPORT_SYMBOL_GPL(gre_add_protocol); - -int gre_del_protocol(const struct gre_protocol *proto, u8 version) -{ - if (version >= GREPROTO_MAX) - goto err_out; - - spin_lock(&gre_proto_lock); - if (rcu_dereference_protected(gre_proto[version], - lockdep_is_held(&gre_proto_lock)) != proto) - goto err_out_unlock; - RCU_INIT_POINTER(gre_proto[version], NULL); - spin_unlock(&gre_proto_lock); - synchronize_rcu(); - return 0; - -err_out_unlock: - spin_unlock(&gre_proto_lock); -err_out: - return -1; -} -EXPORT_SYMBOL_GPL(gre_del_protocol); - -static int gre_rcv(struct sk_buff *skb) -{ - const struct gre_protocol *proto; - u8 ver; - int ret; - - if (!pskb_may_pull(skb, 12)) - goto drop; - - ver = skb->data[1]&0x7f; - if (ver >= GREPROTO_MAX) - goto drop; - - rcu_read_lock(); - proto = rcu_dereference(gre_proto[ver]); - if (!proto || !proto->handler) - goto drop_unlock; - ret = proto->handler(skb); - rcu_read_unlock(); - return ret; - -drop_unlock: - rcu_read_unlock(); -drop: - kfree_skb(skb); - return NET_RX_DROP; -} - -static void gre_err(struct sk_buff *skb, u32 info) -{ - const struct gre_protocol *proto; - const struct iphdr *iph = (const struct iphdr *)skb->data; - u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; - - if (ver >= GREPROTO_MAX) - return; - - rcu_read_lock(); - proto = rcu_dereference(gre_proto[ver]); - if (proto && proto->err_handler) - proto->err_handler(skb, info); - rcu_read_unlock(); -} - -static const struct net_protocol net_gre_protocol = { - .handler = gre_rcv, - .err_handler = gre_err, - .netns_ok = 1, -}; - -static int __init gre_init(void) -{ - pr_info("GRE over IPv4 demultiplexor driver\n"); - - if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { - pr_err("can't add protocol\n"); - return -EAGAIN; - } - - return 0; -} - -static void __exit gre_exit(void) -{ - inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); -} - -module_init(gre_init); -module_exit(gre_exit); - -MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); -MODULE_LICENSE("GPL"); - diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c new file mode 100644 index 00000000000..0485bf7f8f0 --- /dev/null +++ b/net/ipv4/gre_demux.c @@ -0,0 +1,364 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/if.h> +#include <linux/icmp.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <linux/if_tunnel.h> +#include <linux/spinlock.h> +#include <net/protocol.h> +#include <net/gre.h> + +#include <net/icmp.h> +#include <net/route.h> +#include <net/xfrm.h> + +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; +static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX]; + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + return -EINVAL; + + return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? + 0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ + int ret; + + if (version >= GREPROTO_MAX) + return -EINVAL; + + ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? + 0 : -EBUSY; + + if (ret) + return ret; + + synchronize_rcu(); + return 0; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(tpi->flags); + greh->protocol = tpi->proto; + + if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (tpi->flags&TUNNEL_SEQ) { + *ptr = tpi->seq; + ptr--; + } + if (tpi->flags&TUNNEL_KEY) { + *ptr = tpi->key; + ptr--; + } + if (tpi->flags&TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } +} +EXPORT_SYMBOL_GPL(gre_build_header); + +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err) +{ + unsigned int ip_hlen = ip_hdrlen(skb); + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = ip_gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (skb_checksum_simple_validate(skb)) { + *csum_err = true; + return -EINVAL; + } + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else + tpi->key = 0; + + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else + tpi->seq = 0; + + /* WCCP version 1 and 2 protocol decoding. 
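
Where the deleted gre.c serialized handler registration with a spinlock, the new gre_demux.c above publishes the handler with a single cmpxchg() and returns -EBUSY if the slot is already taken, followed by an RCU grace period on removal. A minimal user-space sketch of that pattern with C11 atomics (handler type and return values are illustrative):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

typedef int (*handler_fn)(const void *pkt);

static _Atomic(handler_fn) registered_handler;

static int add_handler(handler_fn fn)
{
	handler_fn expected = NULL;

	/* Install only if the slot is still empty, like cmpxchg(..., NULL, proto). */
	return atomic_compare_exchange_strong(&registered_handler, &expected, fn) ? 0 : -EBUSY;
}

static int del_handler(handler_fn fn)
{
	handler_fn expected = fn;

	/* Clear only if 'fn' is the currently installed handler; the kernel
	 * additionally waits for an RCU grace period before 'fn' may be freed. */
	return atomic_compare_exchange_strong(&registered_handler, &expected, NULL) ? 0 : -EBUSY;
}

static int dummy_rcv(const void *pkt) { (void)pkt; return 0; }

int main(void)
{
	printf("add: %d\n", add_handler(dummy_rcv));		/* 0 */
	printf("add again: %d\n", add_handler(dummy_rcv));	/* -EBUSY */
	printf("del: %d\n", del_handler(dummy_rcv));		/* 0 */
	return 0;
}
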
+ * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) { + hdr_len += 4; + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + } + } + + return iptunnel_pull_header(skb, hdr_len, tpi->proto); +} + +static int gre_cisco_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + int i; + bool csum_err = false; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! */ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + + if (parse_gre_header(skb, &tpi, &csum_err) < 0) + goto drop; + + rcu_read_lock(); + for (i = 0; i < GRE_IP_PROTO_MAX; i++) { + struct gre_cisco_protocol *proto; + int ret; + + proto = rcu_dereference(gre_cisco_proto_list[i]); + if (!proto) + continue; + ret = proto->handler(skb, &tpi); + if (ret == PACKET_RCVD) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: + kfree_skb(skb); + return 0; +} + +static void gre_cisco_err(struct sk_buff *skb, u32 info) +{ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + * + * Moreover, Cisco "wise men" put GRE key to the third word + * in GRE header. It makes impossible maintaining even soft + * state for keyed + * GRE tunnels with enabled checksum. Tell them "thank you". + * + * Well, I wonder, rfc1812 was written by Cisco employee, + * what the hell these idiots break standards established + * by themselves??? + */ + + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct tnl_ptk_info tpi; + bool csum_err = false; + int i; + + if (parse_gre_header(skb, &tpi, &csum_err)) { + if (!csum_err) /* ignore csum errors. 
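
parse_gre_header() above sizes the header from the flag bits before pulling the optional words. Per RFC 2784/2890 the base GRE header is 4 bytes and each of the C (checksum), K (key) and S (sequence) flags adds a further 4-byte word; a tiny sketch of that accounting (the MY_GRE_* constants are local stand-ins):

#include <stdint.h>
#include <stdio.h>

#define MY_GRE_CSUM 0x8000
#define MY_GRE_KEY  0x2000
#define MY_GRE_SEQ  0x1000

/* 4-byte base header plus one 4-byte word per present C/K/S option. */
static unsigned int gre_hdr_len(uint16_t flags)
{
	unsigned int len = 4;

	if (flags & MY_GRE_CSUM)
		len += 4;
	if (flags & MY_GRE_KEY)
		len += 4;
	if (flags & MY_GRE_SEQ)
		len += 4;
	return len;
}

int main(void)
{
	printf("key only:   %u\n", gre_hdr_len(MY_GRE_KEY));			/* 8 */
	printf("csum + key: %u\n", gre_hdr_len(MY_GRE_CSUM | MY_GRE_KEY));	/* 12 */
	return 0;
}
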
*/ + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + skb->dev->ifindex, 0, IPPROTO_GRE, 0); + return; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, + IPPROTO_GRE, 0); + return; + } + + rcu_read_lock(); + for (i = 0; i < GRE_IP_PROTO_MAX; i++) { + struct gre_cisco_protocol *proto; + + proto = rcu_dereference(gre_cisco_proto_list[i]); + if (!proto) + continue; + + if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD) + goto out; + + } +out: + rcu_read_unlock(); +} + +static int gre_rcv(struct sk_buff *skb) +{ + const struct gre_protocol *proto; + u8 ver; + int ret; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->handler) + goto drop_unlock; + ret = proto->handler(skb); + rcu_read_unlock(); + return ret; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + const struct gre_protocol *proto; + const struct iphdr *iph = (const struct iphdr *)skb->data; + u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + + if (ver >= GREPROTO_MAX) + return; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (proto && proto->err_handler) + proto->err_handler(skb, info); + rcu_read_unlock(); +} + +static const struct net_protocol net_gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .netns_ok = 1, +}; + +static const struct gre_protocol ipgre_protocol = { + .handler = gre_cisco_rcv, + .err_handler = gre_cisco_err, +}; + +int gre_cisco_register(struct gre_cisco_protocol *newp) +{ + struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) + &gre_cisco_proto_list[newp->priority]; + + return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(gre_cisco_register); + +int gre_cisco_unregister(struct gre_cisco_protocol *del_proto) +{ + struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) + &gre_cisco_proto_list[del_proto->priority]; + int ret; + + ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL; + + if (ret) + return ret; + + synchronize_net(); + return 0; +} +EXPORT_SYMBOL_GPL(gre_cisco_unregister); + +static int __init gre_init(void) +{ + pr_info("GRE over IPv4 demultiplexor driver\n"); + + if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { + pr_err("can't add protocol\n"); + goto err; + } + + if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) { + pr_info("%s: can't add ipgre handler\n", __func__); + goto err_gre; + } + + return 0; +err_gre: + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +err: + return -EAGAIN; +} + +static void __exit gre_exit(void) +{ + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. 
Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c new file mode 100644 index 00000000000..f0bdd47bbbc --- /dev/null +++ b/net/ipv4/gre_offload.c @@ -0,0 +1,298 @@ +/* + * IPV4 GSO/GRO offload support + * Linux INET implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * GRE GSO support + */ + +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/gre.h> + +static int gre_gso_send_check(struct sk_buff *skb) +{ + if (!skb->encapsulation) + return -EINVAL; + return 0; +} + +static struct sk_buff *gre_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t enc_features; + int ghl; + struct gre_base_hdr *greh; + u16 mac_offset = skb->mac_header; + int mac_len = skb->mac_len; + __be16 protocol = skb->protocol; + int tnl_hlen; + bool csum; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_TCPV6 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + goto out; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + + ghl = skb_inner_network_header(skb) - skb_transport_header(skb); + if (unlikely(ghl < sizeof(*greh))) + goto out; + + csum = !!(greh->flags & GRE_CSUM); + if (csum) + skb->encap_hdr_csum = 1; + + if (unlikely(!pskb_may_pull(skb, ghl))) + goto out; + + /* setup inner skb. */ + skb->protocol = greh->protocol; + skb->encapsulation = 0; + + __skb_pull(skb, ghl); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); + goto out; + } + + skb = segs; + tnl_hlen = skb_tnl_header_len(skb); + do { + __skb_push(skb, ghl); + if (csum) { + __be32 *pcsum; + + if (skb_has_shared_frag(skb)) { + int err; + + err = __skb_linearize(skb); + if (err) { + kfree_skb_list(segs); + segs = ERR_PTR(err); + goto out; + } + } + + skb_reset_transport_header(skb); + + greh = (struct gre_base_hdr *) + skb_transport_header(skb); + pcsum = (__be32 *)(greh + 1); + *pcsum = 0; + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); + } + __skb_push(skb, tnl_hlen - ghl); + + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb->mac_len = mac_len; + skb->protocol = protocol; + } while ((skb = skb->next)); +out: + return segs; +} + +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. 
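
The GSO path above zeroes the GRE checksum field and recomputes it for every segment; the value is the standard Internet checksum, i.e. the one's complement of the one's complement sum of header plus payload, so re-summing a correctly checksummed packet folds to zero. A self-contained sketch of that computation (byte-oriented, not the kernel's csum helpers):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* RFC 1071 style checksum: 16-bit one's-complement sum, carries folded back,
 * then complemented. */
static uint16_t csum16(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)data[0] << 8 | data[1];
		data += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)data[0] << 8;	/* odd trailing byte, zero padded */

	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t pkt[] = { 0x80, 0x00, 0x00, 0x00, 0xca, 0xfe, 0x00, 0x01 };
	uint16_t c = csum16(pkt, sizeof(pkt));

	pkt[2] = c >> 8;	/* store the checksum in its field */
	pkt[3] = c & 0xff;
	printf("verify: %#x\n", csum16(pkt, sizeof(pkt)));	/* 0 for a valid packet */
	return 0;
}
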
+ */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ + __sum16 sum; + + skb->csum = skb_checksum(skb, 0, skb->len, 0); + NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, + csum_partial(skb->data, skb_gro_offset(skb), 0)); + sum = csum_fold(NAPI_GRO_CB(skb)->csum); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { + if (unlikely(!sum) && !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } else { + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + } + + return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + const struct gre_base_hdr *greh; + unsigned int hlen, grehlen; + unsigned int off; + int flush = 1; + struct packet_offload *ptype; + __be16 type; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*greh); + greh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out; + } + + /* Only support version 0 and K (key), C (csum) flags. Note that + * although the support for the S (seq#) flag can be added easily + * for GRO, this is problematic for GSO hence can not be enabled + * here because a GRO pkt may end up in the forwarding path, thus + * requiring GSO support to break it up correctly. + */ + if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) + goto out; + + type = greh->protocol; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) + goto out_unlock; + + grehlen = GRE_HEADER_SECTION; + + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + hlen = off + grehlen; + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out_unlock; + } + if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ + __sum16 csum = 0; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum = csum_fold(NAPI_GRO_CB(skb)->csum); + /* Don't trust csum error calculated/reported by h/w */ + if (skb->ip_summed == CHECKSUM_NONE || csum != 0) + csum = gro_skb_checksum(skb); + + /* GRE CSUM is the 1's complement of the 1's complement sum + * of the GRE hdr plus payload so it should add up to 0xffff + * (and 0 after csum_fold()) just like the IPv4 hdr csum. + */ + if (csum) + goto out_unlock; + } + flush = 0; + + for (p = *head; p; p = p->next) { + const struct gre_base_hdr *greh2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + /* The following checks are needed to ensure only pkts + * from the same tunnel are considered for aggregation. + * The criteria for "the same tunnel" includes: + * 1) same version (we only support version 0 here) + * 2) same protocol (we only support ETH_P_IP for now) + * 3) same set of flags + * 4) same key if the key field is present. 
+ */ + greh2 = (struct gre_base_hdr *)(p->data + off); + + if (greh2->flags != greh->flags || + greh2->protocol != greh->protocol) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + if (greh->flags & GRE_KEY) { + /* compare keys */ + if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + } + + skb_gro_pull(skb, grehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, greh, grehlen); + + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); + struct packet_offload *ptype; + unsigned int grehlen = sizeof(*greh); + int err = -ENOENT; + __be16 type; + + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type = SKB_GSO_GRE; + + type = greh->protocol; + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + + rcu_read_unlock(); + return err; +} + +static const struct net_offload gre_offload = { + .callbacks = { + .gso_send_check = gre_gso_send_check, + .gso_segment = gre_gso_segment, + .gro_receive = gre_gro_receive, + .gro_complete = gre_gro_complete, + }, +}; + +static int __init gre_offload_init(void) +{ + return inet_add_offload(&gre_offload, IPPROTO_GRE); +} +device_initcall(gre_offload_init); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c75efbdc71c..42b7bcf8045 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -95,6 +95,7 @@ #include <net/checksum.h> #include <net/xfrm.h> #include <net/inet_common.h> +#include <net/ip_fib.h> /* * Build xmit assembly blocks @@ -253,10 +254,11 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, /* Limit if icmp type is enabled in ratemask. 
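
gre_gro_receive() above only keeps packets in the same flow when version, protocol, flags and, if a key is present, the key all match. A minimal stand-alone predicate expressing those criteria (struct layout and constant are local stand-ins, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MY_GRE_KEY 0x2000

struct gre_base { uint16_t flags; uint16_t protocol; };

/* Same flow only if flags and protocol match and, when keyed, the keys match. */
static bool gre_same_flow(const struct gre_base *a, uint32_t a_key,
			  const struct gre_base *b, uint32_t b_key)
{
	if (a->flags != b->flags || a->protocol != b->protocol)
		return false;
	if (a->flags & MY_GRE_KEY)
		return a_key == b_key;
	return true;
}

int main(void)
{
	struct gre_base h1 = { MY_GRE_KEY, 0x0800 };
	struct gre_base h2 = { MY_GRE_KEY, 0x0800 };

	printf("%d\n", gre_same_flow(&h1, 42, &h2, 42));	/* 1 */
	printf("%d\n", gre_same_flow(&h1, 42, &h2, 7));		/* 0 */
	return 0;
}
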
*/ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - if (!rt->peer) - rt_bind_peer(rt, fl4->daddr, 1); - rc = inet_peer_xrlim_allow(rt->peer, + struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); + if (peer) + inet_putpeer(peer); } out: return rc; @@ -334,7 +336,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; - __be32 daddr; + __be32 daddr, saddr; + u32 mark = IP4_REPLY_MARK(net, skb->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; @@ -347,9 +350,14 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; + sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; + saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; + if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; if (ipc.opt->opt.srr) @@ -357,7 +365,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = rt->rt_spec_dst; + fl4.saddr = saddr; + fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); @@ -376,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, const struct iphdr *iph, - __be32 saddr, u8 tos, + __be32 saddr, u8 tos, u32 mark, int type, int code, struct icmp_bxm *param) { @@ -388,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->daddr = (param->replyopts.opt.opt.srr ? param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; + fl4->flowi4_mark = mark; fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; @@ -479,12 +489,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct iphdr *iph; int room; - struct icmp_bxm icmp_param; + struct icmp_bxm *icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; u8 tos; + u32 mark; struct net *net; struct sock *sk; @@ -500,7 +511,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph = ip_hdr(skb_in); if ((u8 *)iph < skb_in->head || - (skb_in->network_header + sizeof(*iph)) > skb_in->tail) + (skb_network_header(skb_in) + sizeof(*iph)) > + skb_tail_pointer(skb_in)) goto out; /* @@ -554,9 +566,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } + icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); + if (!icmp_param) + return; + sk = icmp_xmit_lock(net); if (sk == NULL) - return; + goto out_free; /* * Construct source address and options. @@ -569,7 +585,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->rt_iif); + dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -581,8 +597,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) tos = icmp_pointers[type].error ? 
((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) goto out_unlock; @@ -590,19 +607,22 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. */ - icmp_param.data.icmph.type = type; - icmp_param.data.icmph.code = code; - icmp_param.data.icmph.un.gateway = info; - icmp_param.data.icmph.checksum = 0; - icmp_param.skb = skb_in; - icmp_param.offset = skb_network_offset(skb_in); + icmp_param->data.icmph.type = type; + icmp_param->data.icmph.code = code; + icmp_param->data.icmph.un.gateway = info; + icmp_param->data.icmph.checksum = 0; + icmp_param->skb = skb_in; + icmp_param->offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; + sk->sk_mark = mark; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts.opt; + ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, - type, code, &icmp_param); + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, + type, code, icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -614,36 +634,68 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); - icmp_param.data_len = skb_in->len - icmp_param.offset; - if (icmp_param.data_len > room) - icmp_param.data_len = room; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data_len = skb_in->len - icmp_param->offset; + if (icmp_param->data_len > room) + icmp_param->data_len = room; + icmp_param->head_len = sizeof(struct icmphdr); - icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); +out_free: + kfree(icmp_param); out:; } EXPORT_SYMBOL(icmp_send); +static void icmp_socket_deliver(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. + */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + return; + + raw_icmp_error(skb, protocol, info); + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); +} + +static bool icmp_tag_validation(int proto) +{ + bool ok; + + rcu_read_lock(); + ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; + rcu_read_unlock(); + return ok; +} + /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * ICMP_PARAMETERPROB. 
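
The new icmp_socket_deliver() above routes an ICMP error to the transport protocol named in the embedded IP header by looking up inet_protos[] and calling its err_handler. A bare dispatch-table sketch of the same idea, without the RCU and raw-socket steps (names and the TCP stub are hypothetical):

#include <stdio.h>

#define MAX_PROTOS 256

struct proto_ops {
	void (*err_handler)(unsigned int info);
};

static void tcp_err_stub(unsigned int info)
{
	printf("tcp error handler, info=%u\n", info);
}

static const struct proto_ops tcp_ops = { .err_handler = tcp_err_stub };

static const struct proto_ops *proto_table[MAX_PROTOS] = {
	[6] = &tcp_ops,		/* IPPROTO_TCP */
};

static void deliver_icmp_error(unsigned int protocol, unsigned int info)
{
	const struct proto_ops *ops;

	if (protocol >= MAX_PROTOS)
		return;
	ops = proto_table[protocol];
	if (ops && ops->err_handler)
		ops->err_handler(info);	/* no handler registered: silently ignored */
}

int main(void)
{
	deliver_icmp_error(6, 1500);	/* dispatched to the TCP stub */
	deliver_icmp_error(17, 1500);	/* nothing registered for UDP here */
	return 0;
}
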
*/ static void icmp_unreach(struct sk_buff *skb) { const struct iphdr *iph; struct icmphdr *icmph; - int hash, protocol; - const struct net_protocol *ipprot; - u32 info = 0; struct net *net; + u32 info = 0; net = dev_net(skb_dst(skb)->dev); @@ -670,15 +722,23 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { + /* for documentation of the ip_no_pmtu_disc + * values please see + * Documentation/networking/ip-sysctl.txt + */ + switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + default: LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); - } else { - info = ip_rt_frag_needed(net, iph, - ntohs(icmph->un.frag.mtu), - skb->dev); - if (!info) + break; + case 2: + goto out; + case 3: + if (!icmp_tag_validation(iph->protocol)) goto out; + /* fall through */ + case 0: + info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: @@ -720,26 +780,7 @@ static void icmp_unreach(struct sk_buff *skb) goto out; } - /* Checkin full IP header plus 8 bytes of protocol to - * avoid additional coding at protocol handlers. - */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) - goto out; - - iph = (const struct iphdr *)skb->data; - protocol = iph->protocol; - - /* - * Deliver ICMP message to raw sockets. Pretty useless feature? - */ - raw_icmp_error(skb, protocol, info); - - hash = protocol & (MAX_INET_PROTOS - 1); - rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[hash]); - if (ipprot && ipprot->err_handler) - ipprot->err_handler(skb, info); - rcu_read_unlock(); + icmp_socket_deliver(skb, info); out: return; @@ -755,46 +796,15 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { - const struct iphdr *iph; - - if (skb->len < sizeof(struct iphdr)) - goto out_err; - - /* - * Get the copied header of the packet that caused the redirect - */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - - iph = (const struct iphdr *)skb->data; - - switch (icmp_hdr(skb)->code & 7) { - case ICMP_REDIR_NET: - case ICMP_REDIR_NETTOS: - /* - * As per RFC recommendations now handle it as a host redirect. - */ - case ICMP_REDIR_HOST: - case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, - icmp_hdr(skb)->un.gateway, - iph->saddr, skb->dev); - break; + if (skb->len < sizeof(struct iphdr)) { + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + return; } - /* Ping wants to see redirects. - * Let's pretend they are errors of sorts... */ - if (iph->protocol == IPPROTO_ICMP && - iph->ihl >= 5 && - pskb_may_pull(skb, (iph->ihl<<2)+8)) { - ping_err(skb, icmp_hdr(skb)->un.gateway); - } + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return; -out: - return; -out_err: - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - goto out; + icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); } /* @@ -868,86 +878,6 @@ out_err: goto out; } - -/* - * Handle ICMP_ADDRESS_MASK requests. (RFC950) - * - * RFC1122 (3.2.2.9). A host MUST only send replies to - * ADDRESS_MASK requests if it's been configured as an address mask - * agent. Receiving a request doesn't constitute implicit permission to - * act as one. Of course, implementing this correctly requires (SHOULD) - * a way to turn the functionality on and off. Another one for sysctl(), - * I guess. -- MS - * - * RFC1812 (4.3.3.9). A router MUST implement it. - * A router SHOULD have switch turning it on/off. - * This switch MUST be ON by default. 
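
In the ICMP_FRAG_NEEDED branch above, the ip_no_pmtu_disc value 0 case now simply takes the next-hop MTU with ntohs(icmph->un.frag.mtu). RFC 1191 places that MTU in the low 16 bits of the otherwise unused word of a Destination Unreachable message; a tiny sketch of the extraction (the struct here is illustrative, not the kernel's icmphdr):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Destination Unreachable, code 4 (fragmentation needed and DF set). */
struct icmp_frag_needed {
	uint8_t  type;		/* 3 */
	uint8_t  code;		/* 4 */
	uint16_t checksum;
	uint16_t unused;
	uint16_t mtu;		/* next-hop MTU, network byte order (RFC 1191) */
};

int main(void)
{
	struct icmp_frag_needed msg = { .type = 3, .code = 4, .mtu = htons(1400) };

	/* Equivalent of "info = ntohs(icmph->un.frag.mtu)" in the hunk above. */
	printf("next-hop MTU: %u\n", ntohs(msg.mtu));
	return 0;
}
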
- * - * Gratuitous replies, zero-source replies are not implemented, - * that complies with RFC. DO NOT implement them!!! All the idea - * of broadcast addrmask replies as specified in RFC950 is broken. - * The problem is that it is not uncommon to have several prefixes - * on one physical interface. Moreover, addrmask agent can even be - * not aware of existing another prefixes. - * If source is zero, addrmask agent cannot choose correct prefix. - * Gratuitous mask announcements suffer from the same problem. - * RFC1812 explains it, but still allows to use ADDRMASK, - * that is pretty silly. --ANK - * - * All these rules are so bizarre, that I removed kernel addrmask - * support at all. It is wrong, it is obsolete, nobody uses it in - * any case. --ANK - * - * Furthermore you can do it with a usermode address agent program - * anyway... - */ - -static void icmp_address(struct sk_buff *skb) -{ -#if 0 - net_dbg_ratelimited("a guy asks for address mask. Who is it?\n"); -#endif -} - -/* - * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain - * loudly if an inconsistency is found. - * called with rcu_read_lock() - */ - -static void icmp_address_reply(struct sk_buff *skb) -{ - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = skb->dev; - struct in_device *in_dev; - struct in_ifaddr *ifa; - - if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) - return; - - in_dev = __in_dev_get_rcu(dev); - if (!in_dev) - return; - - if (in_dev->ifa_list && - IN_DEV_LOG_MARTIANS(in_dev) && - IN_DEV_FORWARD(in_dev)) { - __be32 _mask, *mp; - - mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); - BUG_ON(mp == NULL); - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - if (*mp == ifa->ifa_mask && - inet_ifa_match(ip_hdr(skb)->saddr, ifa)) - break; - } - if (!ifa) - net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n", - mp, - dev->name, &ip_hdr(skb)->saddr); - } -} - static void icmp_discard(struct sk_buff *skb) { } @@ -983,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb) ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto error; - } + if (skb_checksum_simple_validate(skb)) + goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; @@ -1039,11 +961,37 @@ int icmp_rcv(struct sk_buff *skb) drop: kfree_skb(skb); return 0; +csum_error: + ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS); error: ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto drop; } +void icmp_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + int offset = iph->ihl<<2; + struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; + struct net *net = dev_net(skb->dev); + + /* + * Use ping_err to handle all icmp errors except those + * triggered by ICMP_ECHOREPLY which sent from kernel. + */ + if (icmph->type != ICMP_ECHOREPLY) { + ping_err(skb, offset, info); + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + else if (type == ICMP_REDIRECT) + ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); +} + /* * This table is the definition of how we handle ICMP. 
*/ @@ -1111,10 +1059,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { .handler = icmp_discard, }, [ICMP_ADDRESS] = { - .handler = icmp_address, + .handler = icmp_discard, }, [ICMP_ADDRESSREPLY] = { - .handler = icmp_address_reply, + .handler = icmp_discard, }, }; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6699f23e6f5..db710b059ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -88,6 +88,7 @@ #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> +#include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/arp.h> @@ -113,7 +114,8 @@ #define IGMP_V1_Router_Present_Timeout (400*HZ) #define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) +#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) #define IGMP_Query_Response_Interval (10*HZ) #define IGMP_Unsolicited_Report_Count 2 @@ -138,6 +140,29 @@ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) +static int unsolicited_report_interval(struct in_device *in_dev) +{ + int interval_ms, interval_jiffies; + + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) + interval_ms = IN_DEV_CONF_GET( + in_dev, + IGMPV2_UNSOLICITED_REPORT_INTERVAL); + else /* v3 */ + interval_ms = IN_DEV_CONF_GET( + in_dev, + IGMPV3_UNSOLICITED_REPORT_INTERVAL); + + interval_jiffies = msecs_to_jiffies(interval_ms); + + /* _timer functions can't handle a delay of 0 jiffies so ensure + * we always return a positive value. + */ + if (interval_jiffies <= 0) + interval_jiffies = 1; + return interval_jiffies; +} + static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); static void igmpv3_clear_delrec(struct in_device *in_dev); @@ -186,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = net_random() % max_delay; + int tv = prandom_u32() % max_delay; im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -195,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = net_random() % in_dev->mr_maxdelay; + int tv = prandom_u32() % in_dev->mr_maxdelay; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) @@ -204,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = net_random() % delay; + int tv = prandom_u32() % delay; if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); @@ -285,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) struct ip_sf_list *psf; int scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) if (size < 256) return NULL; } + skb->priority = TC_PRIO_CONTROL; igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, @@ -343,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(pip, 
&rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -363,7 +389,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); - const int igmplen = skb->tail - skb->transport_header; + const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); @@ -437,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, } first = 1; psf_prev = NULL; - for (psf=*psf_list; psf; psf=psf_next) { + for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; @@ -494,7 +520,7 @@ empty_source: return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ - if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) { + if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } @@ -550,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf=*ppsf; psf; psf = psf_next) { + for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) @@ -574,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; @@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ip_rt_put(rt); return -1; } + skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); @@ -687,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; @@ -709,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data) in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); - __in_dev_put(in_dev); + in_dev_put(in_dev); } static void igmp_ifc_timer_expire(unsigned long data) @@ -719,9 +746,10 @@ static void igmp_ifc_timer_expire(unsigned long data) igmpv3_send_cr(in_dev); if (in_dev->mr_ifc_count) { in_dev->mr_ifc_count--; - igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); + igmp_ifc_start_timer(in_dev, + unsolicited_report_interval(in_dev)); } - __in_dev_put(in_dev); + in_dev_put(in_dev); } static void igmp_ifc_event(struct in_device *in_dev) @@ -736,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev) static void igmp_timer_expire(unsigned long data) { - struct ip_mc_list *im=(struct ip_mc_list *)data; + struct ip_mc_list *im = (struct ip_mc_list *)data; struct in_device *in_dev = im->interface; spin_lock(&im->lock); @@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data) if (im->unsolicit_count) { im->unsolicit_count--; - igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + igmp_start_timer(im, unsolicited_report_interval(in_dev)); } im->reporter = 1; spin_unlock(&im->lock); @@ -766,10 +794,10 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) int i, scount; scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = 
pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) { + for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != @@ -797,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) /* mark INCLUDE-mode sources */ scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) + for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; @@ -815,14 +843,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) return 1; } -static void igmp_heard_report(struct in_device *in_dev, __be32 group) +/* return true if packet was dropped */ +static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) - return; + return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { @@ -832,9 +861,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group) } } rcu_read_unlock(); + return false; } -static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, +/* return true if packet was dropped */ +static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { struct igmphdr *ih = igmp_hdr(skb); @@ -866,7 +897,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, /* clear deleted report items */ igmpv3_clear_delrec(in_dev); } else if (len < 12) { - return; /* ignore bogus packet; freed by caller */ + return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ max_delay = IGMP_Query_Response_Interval; @@ -883,13 +914,13 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) - return; + return true; ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) - return; + return true; ih3 = igmpv3_query_hdr(skb); } @@ -901,9 +932,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, in_dev->mr_qrv = ih3->qrv; if (!group) { /* general query */ if (ih3->nsrcs) - return; /* no sources allowed */ + return false; /* no sources allowed */ igmp_gq_start_timer(in_dev); - return; + return false; } /* mark sources to include, if group & source-specific */ mark = ih3->nsrcs != 0; @@ -939,6 +970,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, igmp_mod_timer(im, max_delay); } rcu_read_unlock(); + return false; } /* called in rcu_read_lock() section */ @@ -948,6 +980,7 @@ int igmp_rcv(struct sk_buff *skb) struct igmphdr *ih; struct in_device *in_dev = __in_dev_get_rcu(skb->dev); int len = skb->len; + bool dropped = true; if (in_dev == NULL) goto drop; @@ -955,21 +988,13 @@ int igmp_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto drop; - } + if (skb_checksum_simple_validate(skb)) + goto drop; ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: - 
igmp_heard_query(in_dev, skb, len); + dropped = igmp_heard_query(in_dev, skb, len); break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: @@ -979,7 +1004,7 @@ int igmp_rcv(struct sk_buff *skb) /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) - igmp_heard_report(in_dev, ih->group); + dropped = igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 @@ -997,7 +1022,10 @@ int igmp_rcv(struct sk_buff *skb) } drop: - kfree_skb(skb); + if (dropped) + kfree_skb(skb); + else + consume_skb(skb); return 0; } @@ -1067,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); @@ -1085,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; @@ -1098,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) } spin_unlock_bh(&in_dev->mc_tomb_lock); if (pmc) { - for (psf=pmc->tomb; psf; psf=psf_next) { + for (psf = pmc->tomb; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1131,7 +1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); - for (; psf; psf=psf_next) { + for (; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1209,6 +1237,57 @@ static void igmp_group_added(struct ip_mc_list *im) * Multicast list managers */ +static u32 ip_mc_hash(const struct ip_mc_list *im) +{ + return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); +} + +static void ip_mc_hash_add(struct in_device *in_dev, + struct ip_mc_list *im) +{ + struct ip_mc_list __rcu **mc_hash; + u32 hash; + + mc_hash = rtnl_dereference(in_dev->mc_hash); + if (mc_hash) { + hash = ip_mc_hash(im); + im->next_hash = mc_hash[hash]; + rcu_assign_pointer(mc_hash[hash], im); + return; + } + + /* do not use a hash table for small number of items */ + if (in_dev->mc_count < 4) + return; + + mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, + GFP_KERNEL); + if (!mc_hash) + return; + + for_each_pmc_rtnl(in_dev, im) { + hash = ip_mc_hash(im); + im->next_hash = mc_hash[hash]; + RCU_INIT_POINTER(mc_hash[hash], im); + } + + rcu_assign_pointer(in_dev->mc_hash, mc_hash); +} + +static void ip_mc_hash_remove(struct in_device *in_dev, + struct ip_mc_list *im) +{ + struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); + struct ip_mc_list *aux; + + if (!mc_hash) + return; + mc_hash += ip_mc_hash(im); + while ((aux = rtnl_dereference(*mc_hash)) != im) + mc_hash = &aux->next_hash; + *mc_hash = im->next_hash; +} + /* * A socket has joined a multicast group on device dev. @@ -1250,6 +1329,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev->mc_count++; rcu_assign_pointer(in_dev->mc_list, im); + ip_mc_hash_add(in_dev, im); + #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im->multiaddr); #endif @@ -1262,16 +1343,17 @@ out: EXPORT_SYMBOL(ip_mc_inc_group); /* - * Resend IGMP JOIN report; used for bonding. 
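/* ip_mc_hash_add()/ip_mc_hash_remove() above give an interface a small hash
 * table over its multicast groups once it has joined a handful of them, so
 * the receive-path membership check no longer walks the whole list. A
 * standalone sketch of the idea follows; the multiplier is a generic
 * golden-ratio-style constant picked for illustration (hash_32() in the
 * kernel uses its own), and the RCU publication is left out.
 */
#include <stddef.h>
#include <stdint.h>

#define MC_HASH_BITS 4				/* 16 buckets, example size */

struct mc_entry {
	uint32_t group;				/* multicast address, network order */
	struct mc_entry *next_hash;		/* per-bucket chain */
};

static struct mc_entry *mc_hash[1u << MC_HASH_BITS];

static unsigned int mc_hash_bucket(uint32_t group)
{
	/* multiplicative hash: keep the top MC_HASH_BITS bits of the product */
	return (group * 0x9e3779b9u) >> (32 - MC_HASH_BITS);
}

static void mc_hash_add(struct mc_entry *e)
{
	unsigned int b = mc_hash_bucket(e->group);

	e->next_hash = mc_hash[b];
	mc_hash[b] = e;
}

static struct mc_entry *mc_hash_lookup(uint32_t group)
{
	struct mc_entry *e;

	for (e = mc_hash[mc_hash_bucket(group)]; e; e = e->next_hash)
		if (e->group == group)
			return e;
	return NULL;
}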
- * Called with rcu_read_lock() + * Resend IGMP JOIN report; used by netdev notifier. */ -void ip_mc_rejoin_groups(struct in_device *in_dev) +static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; - for_each_pmc_rcu(in_dev, im) { + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; @@ -1288,7 +1370,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev) } #endif } -EXPORT_SYMBOL(ip_mc_rejoin_groups); /* * A socket has left a multicast group on device dev @@ -1306,6 +1387,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) ip = &i->next_rcu) { if (i->multiaddr == addr) { if (--i->users == 0) { + ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; igmp_group_dropped(i); @@ -1373,13 +1455,9 @@ void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); - in_dev->mc_tomb = NULL; #ifdef CONFIG_IP_MULTICAST - in_dev->mr_gq_running = 0; setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, (unsigned long)in_dev); - in_dev->mr_ifc_count = 0; - in_dev->mc_count = 0; setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; @@ -1471,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1544,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[sfmode]--; } err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; @@ -1564,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { @@ -1585,7 +1663,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1613,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc) struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -1630,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc) int new_in, rv; rv = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -1640,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; @@ -1662,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { @@ -1714,7 +1792,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]++; err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; @@ -1724,7 +1802,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]--; - for (j=0; j<i; j++) + for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST @@ -1743,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { @@ -1758,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *nextpsf; - for (psf=pmc->tomb; psf; psf=nextpsf) { + for (psf = pmc->tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } pmc->tomb = NULL; - for (psf=pmc->sources; psf; psf=nextpsf) { + for (psf = pmc->sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } @@ -1866,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_lock(); in_dev = ip_mc_find_dev(net, imr); + if (!in_dev) { + ret = -ENODEV; + goto out; + } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; @@ -1883,19 +1965,18 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - if (in_dev) - ip_mc_dec_group(in_dev, group); + ip_mc_dec_group(in_dev, group); rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } - if (!in_dev) - ret = -ENODEV; +out: rtnl_unlock(); return ret; } +EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex) @@ -1956,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -1975,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); - for (j=i+1; j<psl->sl_count; j++) + for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; @@ -2001,7 +2082,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { - for (i=0; i<psl->sl_count; i++) + for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); @@ -2011,7 +2092,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2019,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } if (rv == 0) /* address already there is an error */ goto done; - for (j=psl->sl_count-1; j>=i; j--) + for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; @@ -2218,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } - for (i=0; i<copycount; i++) { + for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; @@ -2263,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) if (!psl) goto unlock; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < 
psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } @@ -2312,18 +2393,31 @@ void ip_mc_drop_socket(struct sock *sk) int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) { struct ip_mc_list *im; + struct ip_mc_list __rcu **mc_hash; struct ip_sf_list *psf; int rv = 0; - for_each_pmc_rcu(in_dev, im) { - if (im->multiaddr == mc_addr) - break; + mc_hash = rcu_dereference(in_dev->mc_hash); + if (mc_hash) { + u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); + + for (im = rcu_dereference(mc_hash[hash]); + im != NULL; + im = rcu_dereference(im->next_hash)) { + if (im->multiaddr == mc_addr) + break; + } + } else { + for_each_pmc_rcu(in_dev, im) { + if (im->multiaddr == mc_addr) + break; + } } if (im && proto == IPPROTO_IGMP) { rv = 1; } else if (im) { if (src_addr) { - for (psf=im->sources; psf; psf=psf->sf_next) { + for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } @@ -2435,6 +2529,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) struct ip_mc_list *im = (struct ip_mc_list *)v; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); char *querier; + long delta; + #ifdef CONFIG_IP_MULTICAST querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : IGMP_V2_SEEN(state->in_dev) ? "V2" : @@ -2448,11 +2544,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } + delta = im->timer.expires - jiffies; seq_printf(seq, "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, - im->tm_running, im->tm_running ? - jiffies_to_clock_t(im->timer.expires-jiffies) : 0, + im->tm_running, + im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, im->reporter); } return 0; @@ -2634,33 +2731,72 @@ static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops); + pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); if (!pde) goto out_igmp; - pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + pde = proc_create("mcfilter", S_IRUGO, net->proc_net, + &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; return 0; out_mcfilter: - proc_net_remove(net, "igmp"); + remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { - proc_net_remove(net, "mcfilter"); - proc_net_remove(net, "igmp"); + remove_proc_entry("mcfilter", net->proc_net); + remove_proc_entry("igmp", net->proc_net); } static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; +#endif -int __init igmp_mc_proc_init(void) +static int igmp_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) { - return register_pernet_subsys(&igmp_net_ops); + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct in_device *in_dev; + + switch (event) { + case NETDEV_RESEND_IGMP: + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) + ip_mc_rejoin_groups(in_dev); + break; + default: + break; + } + return NOTIFY_DONE; } + +static struct notifier_block igmp_notifier = { + .notifier_call = igmp_netdev_event, +}; + +int __init igmp_mc_init(void) +{ +#if defined(CONFIG_PROC_FS) + int err; + + err = register_pernet_subsys(&igmp_net_ops); + if (err) + return err; + err = register_netdevice_notifier(&igmp_notifier); + if (err) + goto reg_notif_fail; + return 0; + +reg_notif_fail: + unregister_pernet_subsys(&igmp_net_ops); + return err; +#else 
+ return register_netdevice_notifier(&igmp_notifier); #endif +} diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f9ee7417f6a..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,27 +29,16 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -/* - * This struct holds the first and last local port number. - */ -struct local_ports sysctl_local_ports __read_mostly = { - .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), - .range = { 32768, 61000 }, -}; - -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - -void inet_get_local_port_range(int *low, int *high) +void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; do { - seq = read_seqbegin(&sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - *low = sysctl_local_ports.range[0]; - *high = sysctl_local_ports.range[1]; - } while (read_seqretry(&sysctl_local_ports.lock, seq)); + *low = net->ipv4.ip_local_ports.range[0]; + *high = net->ipv4.ip_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); @@ -57,8 +46,9 @@ int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) { struct sock *sk2; - struct hlist_node *node; int reuse = sk->sk_reuse; + int reuseport = sk->sk_reuseport; + kuid_t uid = sock_i_uid((struct sock *)sk); /* * Unlike other sk lookup places we do not check @@ -67,30 +57,32 @@ int inet_csk_bind_conflict(const struct sock *sk, * one this bucket belongs to. */ - sk_for_each_bound(sk2, node, &tb->owners) { + sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { - const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || - sk2_rcv_saddr == sk_rcv_saddr(sk)) + if ((!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + (!reuseport || !sk2->sk_reuseport || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, sock_i_uid(sk2))))) { + + if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || + sk2->sk_rcv_saddr == sk->sk_rcv_saddr) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { - const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || - sk2_rcv_saddr == sk_rcv_saddr(sk)) + if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || + sk2->sk_rcv_saddr == sk->sk_rcv_saddr) break; } } } - return node != NULL; + return sk2 != NULL; } EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); @@ -101,33 +93,36 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_bind_hashbucket *head; - struct hlist_node *node; struct inet_bind_bucket *tb; int ret, attempts = 5; struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; + kuid_t uid = sock_i_uid(sk); local_bh_disable(); if (!snum) { int remaining, rover, low, high; again: - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - smallest_rover = rover = net_random() % remaining + low; + smallest_rover = rover = prandom_u32() % remaining + low; smallest_size = -1; do { - if 
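/* inet_csk_get_port() above now reads the ephemeral port range from the
 * socket's own netns and skips per-netns reserved ports; the probing
 * strategy itself is unchanged: start at a random point in [low, high] and
 * walk the range. A standalone sketch of that strategy, where rand() stands
 * in for prandom_u32() and the two predicates are stubs for the
 * reserved-port bitmap and the bind-hash lookup (all assumptions of the
 * sketch):
 */
#include <stdbool.h>
#include <stdlib.h>

static bool port_reserved(int port)
{
	(void)port;
	return false;		/* stand-in for ip_local_reserved_ports */
}

static bool port_in_use(int port)
{
	(void)port;
	return false;		/* stand-in for the bind-bucket conflict check */
}

static int pick_local_port(int low, int high)
{
	int remaining = high - low + 1;
	int rover = rand() % remaining + low;
	int tries;

	for (tries = 0; tries < remaining; tries++) {
		if (!port_reserved(rover) && !port_in_use(rover))
			return rover;
		if (++rover > high)	/* wrap back to the start of the range */
			rover = low;
	}
	return -1;			/* every port in the range was taken */
}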
(inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == rover) { - if (tb->fastreuse > 0 && - sk->sk_reuse && - sk->sk_state != TCP_LISTEN && + if (((tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + uid_eq(tb->fastuid, uid))) && (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; @@ -174,7 +169,7 @@ have_snum: head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } @@ -185,14 +180,18 @@ tb_found: if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN && + if (((tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && smallest_size == -1) { goto success; } else { ret = 1; if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && + if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && smallest_size != -1 && --attempts >= 0) { spin_unlock(&head->lock); goto again; @@ -212,9 +211,19 @@ tb_not_found: tb->fastreuse = 1; else tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; + if (sk->sk_reuseport) { + tb->fastreuseport = 1; + tb->fastuid = uid; + } else + tb->fastreuseport = 0; + } else { + if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; + if (tb->fastreuseport && + (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + tb->fastreuseport = 0; + } success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); @@ -283,7 +292,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct sock *newsk; + struct request_sock *req; int error; lock_sock(sk); @@ -296,7 +307,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) goto out_err; /* Find already established connection */ - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ @@ -308,14 +319,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - - newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - WARN_ON(newsk->sk_state == TCP_SYN_RECV); + req = reqsk_queue_remove(queue); + newsk = req->sk; + + sk_acceptq_removed(sk); + if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { + spin_lock_bh(&queue->fastopenq->lock); + if (tcp_rsk(req)->listener) { + /* We are still waiting for the final ACK from 3WHS + * so can't free req now. 
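/* The fastreuseport/fastuid fields threaded through the bind bucket above
 * implement SO_REUSEPORT: several sockets owned by the same user may bind
 * the identical address and port if every one of them sets the option
 * before bind(). A minimal userspace illustration (error handling is
 * omitted and port 8080 is an arbitrary example):
 */
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int bound_reuseport_socket(void)
{
	struct sockaddr_in a;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&a, 0, sizeof(a));
	a.sin_family = AF_INET;
	a.sin_addr.s_addr = htonl(INADDR_ANY);
	a.sin_port = htons(8080);
	bind(fd, (struct sockaddr *)&a, sizeof(a));
	listen(fd, 16);
	return fd;
}

int main(void)
{
	/* Both binds succeed because each socket asked for SO_REUSEPORT. */
	int a = bound_reuseport_socket();
	int b = bound_reuseport_socket();

	close(a);
	close(b);
	return 0;
}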
Instead, we set req->sk to + * NULL to signify that the child socket is taken + * so reqsk_fastopen_remove() will free the req + * when 3WHS finishes (or is aborted). + */ + req->sk = NULL; + req = NULL; + } + spin_unlock_bh(&queue->fastopenq->lock); + } out: release_sock(sk); + if (req) + __reqsk_free(req); return newsk; out_err: newsk = NULL; + req = NULL; *err = error; goto out; } @@ -374,18 +403,19 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); + int flags = inet_sk_flowi_flags(sk); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS, - (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, - ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + flags, + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; @@ -403,28 +433,33 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *newinet = inet_sk(newsk); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; fl4 = &newinet->cork.fl.u.ip4; - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + + rcu_read_lock(); + opt = rcu_dereference(newinet->inet_opt); + flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), - (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, - ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -457,9 +492,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, prev = &req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); - if (ireq->rmt_port == rport && - ireq->rmt_addr == raddr && - ireq->loc_addr == laddr && + if (ireq->ir_rmt_port == rport && + ireq->ir_rmt_addr == raddr && + ireq->ir_loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { WARN_ON(req->sk); *prevp = prev; @@ -476,7 +511,8 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, + inet_rsk(req)->ir_rmt_port, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); @@ -495,20 +531,30 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, int *expire, int *resend) { if (!rskq_defer_accept) { - *expire = req->retrans >= thresh; + *expire = req->num_timeout >= thresh; *resend = 1; return; } - *expire = req->retrans >= thresh && - (!inet_rsk(req)->acked || req->retrans >= max_retries); + *expire = req->num_timeout >= thresh && + (!inet_rsk(req)->acked || req->num_timeout >= max_retries); /* * Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. 
*/ *resend = !inet_rsk(req)->acked || - req->retrans >= rskq_defer_accept - 1; + req->num_timeout >= rskq_defer_accept - 1; +} + +int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +{ + int err = req->rsk_ops->rtx_syn_ack(parent, req); + + if (!err) + req->num_retrans++; + return err; } +EXPORT_SYMBOL(inet_rtx_syn_ack); void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long interval, @@ -573,13 +619,14 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, req->rsk_ops->syn_ack_timeout(parent, req); if (!expire && (!resend || - !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || + !inet_rtx_syn_ack(parent, req) || inet_rsk(req)->acked)) { unsigned long timeo; - if (req->retrans++ == 0) + if (req->num_timeout++ == 0) lopt->qlen_young--; - timeo = min((timeout << req->retrans), max_rto); + timeo = min(timeout << req->num_timeout, + max_rto); req->expires = now + timeo; reqp = &req->dl_next; continue; @@ -625,11 +672,13 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newsk->sk_state = TCP_SYN_RECV; newicsk->icsk_bind_hash = NULL; - inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; - inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); - inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; + inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; + inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; + inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + newsk->sk_mark = inet_rsk(req)->ir_mark; + newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; @@ -673,6 +722,23 @@ void inet_csk_destroy_sock(struct sock *sk) } EXPORT_SYMBOL(inet_csk_destroy_sock); +/* This function allows to force a closure of a socket after the call to + * tcp/dccp_create_openreq_child(). + */ +void inet_csk_prepare_forced_close(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + /* sk_clone_lock locked the socket and set refcnt to 2 */ + bh_unlock_sock(sk); + sock_put(sk); + + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} +EXPORT_SYMBOL(inet_csk_prepare_forced_close); + int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); @@ -714,13 +780,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start); void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *acc_req; struct request_sock *req; inet_csk_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + acc_req = reqsk_queue_yank_acceptq(queue); /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -730,7 +797,7 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. 
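/* inet_rtx_syn_ack() above centralises the retransmit counter, and the
 * prune loop reschedules an unanswered SYN-ACK with a capped exponential
 * backoff: timeout << num_timeout, limited to max_rto. A standalone sketch
 * of that schedule; the base of 3 and cap of 60 used in main() are example
 * values only:
 */
#include <stdio.h>

static unsigned long next_synack_timeout(unsigned long base,
					 unsigned int num_timeout,
					 unsigned long max_rto)
{
	unsigned long t = base << num_timeout;	/* 1x, 2x, 4x, 8x, ... */

	return t < max_rto ? t : max_rto;
}

int main(void)
{
	unsigned int n;

	for (n = 0; n < 6; n++)
		printf("timeout %u -> wait %lu\n",
		       n, next_synack_timeout(3, n, 60));
	return 0;
}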
--ANK */ - reqsk_queue_destroy(&icsk->icsk_accept_queue); + reqsk_queue_destroy(queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; @@ -748,6 +815,19 @@ void inet_csk_listen_stop(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != tcp_rsk(req)->listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + sock_put(sk); + } inet_csk_destroy_sock(child); bh_unlock_sock(child); @@ -757,6 +837,17 @@ void inet_csk_listen_stop(struct sock *sk) sk_acceptq_removed(sk); __reqsk_free(req); } + if (queue->fastopenq != NULL) { + /* Free all the reqs queued in rskq_rst_head. */ + spin_lock_bh(&queue->fastopenq->lock); + acc_req = queue->fastopenq->rskq_rst_head; + queue->fastopenq->rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq->lock); + while ((req = acc_req) != NULL) { + acc_req = req->dl_next; + __reqsk_free(req); + } + } WARN_ON(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); @@ -799,3 +890,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); #endif + +static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + fl4 = &fl->u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_protocol, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + if (IS_ERR(rt)) + rt = NULL; + if (rt) + sk_setup_caps(sk, &rt->dst); + rcu_read_unlock(); + + return &rt->dst; +} + +struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + struct inet_sock *inet = inet_sk(sk); + + if (!dst) { + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); + if (!dst) + goto out; + } + dst->ops->update_pmtu(dst, sk, NULL, mtu); + + dst = __sk_dst_check(sk, 0); + if (!dst) + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +out: + return dst; +} +EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 46d1e7199a8..e34dccbc4d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,11 +44,12 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */ + struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */ +#endif }; -#define INET_DIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) - static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) @@ -72,30 +73,29 @@ static inline void inet_diag_unlock_handler( int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct inet_diag_req_v2 *req, - u32 pid, u32 seq, u16 nlmsg_flags, + struct user_namespace *user_ns, + u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { const struct inet_sock *inet = 
inet_sk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; + struct nlattr *attr; void *info = NULL; - struct inet_diag_meminfo *minfo = NULL; - unsigned char *b = skb_tail_pointer(skb); const struct inet_diag_handler *handler; int ext = req->idiag_ext; handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(handler == NULL); - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = nlmsg_flags; + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; - r = NLMSG_DATA(nlh); + r = nlmsg_data(nlh); BUG_ON(sk->sk_state == TCP_TIME_WAIT); - if (ext & (1 << (INET_DIAG_MEMINFO - 1))) - minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); - r->idiag_family = sk->sk_family; r->idiag_state = sk->sk_state; r->idiag_timer = 0; @@ -106,39 +106,54 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = inet->inet_dport; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = inet->inet_rcv_saddr; r->id.idiag_dst[0] = inet->inet_daddr; + if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) + goto errout; + /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, * hence this needs to be included regardless of socket family. */ if (ext & (1 << (INET_DIAG_TOS - 1))) - RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); + if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) + goto errout; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { - const struct ipv6_pinfo *np = inet6_sk(sk); - *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = np->daddr; + *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + if (ext & (1 << (INET_DIAG_TCLASS - 1))) - RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); + if (nla_put_u8(skb, INET_DIAG_TCLASS, + inet6_sk(sk)->tclass) < 0) + goto errout; } #endif - r->idiag_uid = sock_i_uid(sk); + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); - if (minfo) { - minfo->idiag_rmem = sk_rmem_alloc_get(sk); - minfo->idiag_wmem = sk->sk_wmem_queued; - minfo->idiag_fmem = sk->sk_forward_alloc; - minfo->idiag_tmem = sk_wmem_alloc_get(sk); + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { + struct inet_diag_meminfo minfo = { + .idiag_rmem = sk_rmem_alloc_get(sk), + .idiag_wmem = sk->sk_wmem_queued, + .idiag_fmem = sk->sk_forward_alloc, + .idiag_tmem = sk_wmem_alloc_get(sk), + }; + + if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) + goto errout; } if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) - goto rtattr_failure; + goto errout; if (icsk == NULL) { handler->idiag_get_info(sk, r, NULL); @@ -147,7 +162,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, #define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS || + icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); @@ -165,16 +182,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) - info = 
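/* The surrounding hunks move inet_diag from the old NLMSG_PUT/RTA_PUT
 * macros, which jumped to a failure label on overflow, to nlmsg_put(),
 * nla_put() and nla_reserve(), which return an error handled at errout.
 * Underneath, every netlink attribute is a 4-byte-aligned type-length-value
 * record. A standalone sketch of that layout only (not of the kernel
 * helpers themselves):
 */
#include <stdint.h>
#include <string.h>

struct nl_attr_hdr {			/* length first, then type */
	uint16_t len;			/* header plus payload, unpadded */
	uint16_t type;
};

#define NL_ALIGN(x) (((x) + 3) & ~(size_t)3)

/* Append one attribute to buf; returns bytes consumed, or 0 when the
 * attribute does not fit and the caller should cancel the message.
 */
static size_t put_attr(uint8_t *buf, size_t room, uint16_t type,
		       const void *data, uint16_t dlen)
{
	size_t need = NL_ALIGN(sizeof(struct nl_attr_hdr) + dlen);
	struct nl_attr_hdr hdr = {
		.len  = (uint16_t)(sizeof(struct nl_attr_hdr) + dlen),
		.type = type,
	};

	if (room < need)
		return 0;
	memset(buf, 0, need);		/* zero the alignment padding */
	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), data, dlen);
	return need;
}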
INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info)); - - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { - const size_t len = strlen(icsk->icsk_ca_ops->name); + if (ext & (1 << (INET_DIAG_INFO - 1))) { + attr = nla_reserve(skb, INET_DIAG_INFO, + sizeof(struct tcp_info)); + if (!attr) + goto errout; - strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), - icsk->icsk_ca_ops->name); + info = nla_data(attr); } + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) + if (nla_put_string(skb, INET_DIAG_CONG, + icsk->icsk_ca_ops->name) < 0) + goto errout; + handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT && @@ -182,85 +203,89 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, icsk->icsk_ca_ops->get_info(sk, ext, skb); out: - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - return skb->len; + return nlmsg_end(skb, nlh); -rtattr_failure: -nlmsg_failure: - nlmsg_trim(skb, b); +errout: + nlmsg_cancel(skb, nlh); return -EMSGSIZE; } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_csk_diag_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_req_v2 *req, - u32 pid, u32 seq, u16 nlmsg_flags, + struct user_namespace *user_ns, + u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { return inet_sk_diag_fill(sk, inet_csk(sk), - skb, req, pid, seq, nlmsg_flags, unlh); + skb, req, user_ns, portid, seq, nlmsg_flags, unlh); } static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, struct sk_buff *skb, struct inet_diag_req_v2 *req, - u32 pid, u32 seq, u16 nlmsg_flags, + u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - long tmo; + s32 tmo; struct inet_diag_msg *r; - const unsigned char *previous_tail = skb_tail_pointer(skb); - struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, - unlh->nlmsg_type, sizeof(*r)); + struct nlmsghdr *nlh; - r = NLMSG_DATA(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; - nlh->nlmsg_flags = nlmsg_flags; + r = nlmsg_data(nlh); + BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_ttd - jiffies; + tmo = tw->tw_ttd - inet_tw_time_stamp(); if (tmo < 0) tmo = 0; r->idiag_family = tw->tw_family; r->idiag_retrans = 0; + r->id.idiag_if = tw->tw_bound_dev_if; sock_diag_save_cookie(tw, r->id.idiag_cookie); + r->id.idiag_sport = tw->tw_sport; r->id.idiag_dport = tw->tw_dport; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = tw->tw_rcv_saddr; r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); + r->idiag_expires = jiffies_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { - const struct inet6_timewait_sock *tw6 = - inet6_twsk((struct sock *)tw); - - *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; + *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr; } #endif - nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; - return skb->len; -nlmsg_failure: - nlmsg_trim(skb, previous_tail); - return -EMSGSIZE; + + return nlmsg_end(skb, nlh); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 
nlmsg_flags, + struct inet_diag_req_v2 *r, + struct user_namespace *user_ns, + u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, - skb, r, pid, seq, nlmsg_flags, - unlh); - return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh); + return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, + nlmsg_flags, unlh); + + return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, + nlmsg_flags, unlh); } int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, @@ -269,16 +294,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s int err; struct sock *sk; struct sk_buff *rep; + struct net *net = sock_net(in_skb->sk); err = -EINVAL; if (req->sdiag_family == AF_INET) { - sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); } #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { - sk = inet6_lookup(&init_net, hashinfo, + sk = inet6_lookup(net, hashinfo, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, @@ -298,34 +324,32 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64)), - GFP_KERNEL); - if (!rep) + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + + sizeof(struct tcp_info) + 64, GFP_KERNEL); + if (!rep) { + err = -ENOMEM; goto out; + } err = sk_diag_fill(sk, rep, req, - NETLINK_CB(in_skb).pid, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { WARN_ON(err == -EMSGSIZE); - kfree_skb(rep); + nlmsg_free(rep); goto out; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); if (err > 0) err = 0; out: - if (sk) { - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put((struct inet_timewait_sock *)sk); - else - sock_put(sk); - } + if (sk) + sock_gen_put(sk); + out_nosk: return err; } @@ -419,25 +443,31 @@ static int inet_diag_bc_run(const struct nlattr *_bc, break; } - if (cond->prefix_len == 0) - break; - if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; + if (cond->family != AF_UNSPEC && + cond->family != entry->family) { + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, + cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; if (bitstring_match(addr, cond->addr, cond->prefix_len)) break; - if (entry->family == AF_INET6 && - cond->family == AF_INET) { - if (addr[0] == 0 && addr[1] == 0 && - addr[2] == htonl(0xffff) && - bitstring_match(addr + 3, cond->addr, - cond->prefix_len)) - break; - } yes = 0; break; } @@ -465,10 +495,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) if (entry.family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - entry.saddr = np->rcv_saddr.s6_addr32; - entry.daddr = np->daddr.s6_addr32; + entry.saddr = 
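/* The filter byte-code above matches addresses against a prefix: an address
 * passes INET_DIAG_BC_S_COND/D_COND when its first prefix_len bits equal
 * the condition's address, with IPv4-mapped IPv6 handled by skipping the
 * ::ffff: prefix and comparing the last 32 bits. A standalone sketch of the
 * bit-prefix comparison, written over bytes for clarity (the kernel's
 * bitstring_match() works on 32-bit words):
 */
#include <stdbool.h>
#include <stdint.h>

static bool prefix_match(const uint8_t *a, const uint8_t *b, unsigned int bits)
{
	unsigned int full = bits / 8;		/* whole bytes to compare */
	unsigned int rest = bits % 8;		/* leading bits of the next byte */
	unsigned int i;

	for (i = 0; i < full; i++)
		if (a[i] != b[i])
			return false;
	if (rest) {
		uint8_t mask = (uint8_t)(0xff << (8 - rest));

		if ((a[full] ^ b[full]) & mask)
			return false;
	}
	return true;
}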
sk->sk_v6_rcv_saddr.s6_addr32; + entry.daddr = sk->sk_v6_daddr.s6_addr32; } else #endif { @@ -500,6 +529,55 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* Validate an inet_diag_hostcond. */ +static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + int addr_len; + struct inet_diag_hostcond *cond; + + /* Check hostcond space. */ + *min_len += sizeof(struct inet_diag_hostcond); + if (len < *min_len) + return false; + cond = (struct inet_diag_hostcond *)(op + 1); + + /* Check address family and address length. */ + switch (cond->family) { + case AF_UNSPEC: + addr_len = 0; + break; + case AF_INET: + addr_len = sizeof(struct in_addr); + break; + case AF_INET6: + addr_len = sizeof(struct in6_addr); + break; + default: + return false; + } + *min_len += addr_len; + if (len < *min_len) + return false; + + /* Check prefix length (in bits) vs address length (in bytes). */ + if (cond->prefix_len > 8 * addr_len) + return false; + + return true; +} + +/* Validate a port comparison operator. */ +static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, + int len, int *min_len) +{ + /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ + *min_len += sizeof(struct inet_diag_bc_op); + if (len < *min_len) + return false; + return true; +} + static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { const void *bc = bytecode; @@ -507,29 +585,39 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) while (len > 0) { const struct inet_diag_bc_op *op = bc; + int min_len = sizeof(struct inet_diag_bc_op); //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { - case INET_DIAG_BC_AUTO: case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: + if (!valid_hostcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: - case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len + 4 || op->no & 3) - return -EINVAL; - if (op->no < len && - !valid_cc(bytecode, bytecode_len, len - op->no)) + if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; default: return -EINVAL; } - if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) + + if (op->code != INET_DIAG_BC_NOP) { + if (op->no < min_len || op->no > len + 4 || op->no & 3) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + } + + if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; len -= op->yes; @@ -547,26 +635,27 @@ static int inet_csk_diag_dump(struct sock *sk, return 0; return inet_csk_diag_fill(sk, skb, r, - NETLINK_CB(cb->skb).pid, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } -static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, +static int inet_twsk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, struct inet_diag_req_v2 *r, const struct nlattr *bc) { + struct inet_timewait_sock *tw = inet_twsk(sk); + if (bc != NULL) { struct inet_diag_entry entry; entry.family = tw->tw_family; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { - struct inet6_timewait_sock *tw6 = - inet6_twsk((struct sock *)tw); - entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32; - entry.daddr = tw6->tw_v6_daddr.s6_addr32; + entry.saddr = 
tw->tw_v6_rcv_saddr.s6_addr32; + entry.daddr = tw->tw_v6_daddr.s6_addr32; } else #endif { @@ -582,29 +671,62 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, } return inet_twsk_diag_fill(tw, skb, r, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } +/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses + * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. + */ +static inline void inet_diag_req_addrs(const struct sock *sk, + const struct request_sock *req, + struct inet_diag_entry *entry) +{ + struct inet_request_sock *ireq = inet_rsk(req); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + if (req->rsk_ops->family == AF_INET6) { + entry->saddr = ireq->ir_v6_loc_addr.s6_addr32; + entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32; + } else if (req->rsk_ops->family == AF_INET) { + ipv6_addr_set_v4mapped(ireq->ir_loc_addr, + &entry->saddr_storage); + ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, + &entry->daddr_storage); + entry->saddr = entry->saddr_storage.s6_addr32; + entry->daddr = entry->daddr_storage.s6_addr32; + } + } else +#endif + { + entry->saddr = &ireq->ir_loc_addr; + entry->daddr = &ireq->ir_rmt_addr; + } +} + static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, u32 pid, u32 seq, + struct request_sock *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, const struct nlmsghdr *unlh) { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb_tail_pointer(skb); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = NLM_F_MULTI; - r = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + r = nlmsg_data(nlh); r->idiag_family = sk->sk_family; r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; - r->idiag_retrans = req->retrans; + r->idiag_retrans = req->num_retrans; r->id.idiag_if = sk->sk_bound_dev_if; sock_diag_save_cookie(req, r->id.idiag_cookie); @@ -614,27 +736,29 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, tmo = 0; r->id.idiag_sport = inet->inet_sport; - r->id.idiag_dport = ireq->rmt_port; - r->id.idiag_src[0] = ireq->loc_addr; - r->id.idiag_dst[0] = ireq->rmt_addr; + r->id.idiag_dport = ireq->ir_rmt_port; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + + r->id.idiag_src[0] = ireq->ir_loc_addr; + r->id.idiag_dst[0] = ireq->ir_rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; - r->idiag_uid = sock_i_uid(sk); + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = 0; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = inet6_rsk(req)->loc_addr; - *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; + struct inet_diag_entry entry; + inet_diag_req_addrs(sk, req, &entry); + memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); + memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); } #endif - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - return skb->len; - -nlmsg_failure: - nlmsg_trim(skb, b); - return -1; + return nlmsg_end(skb, nlh); } static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, @@ -678,31 +802,21 @@ static int inet_diag_dump_reqs(struct 
sk_buff *skb, struct sock *sk, if (reqnum < s_reqnum) continue; - if (r->id.idiag_dport != ireq->rmt_port && + if (r->id.idiag_dport != ireq->ir_rmt_port && r->id.idiag_dport) continue; if (bc) { - entry.saddr = -#if IS_ENABLED(CONFIG_IPV6) - (entry.family == AF_INET6) ? - inet6_rsk(req)->loc_addr.s6_addr32 : -#endif - &ireq->loc_addr; - entry.daddr = -#if IS_ENABLED(CONFIG_IPV6) - (entry.family == AF_INET6) ? - inet6_rsk(req)->rmt_addr.s6_addr32 : -#endif - &ireq->rmt_addr; - entry.dport = ntohs(ireq->rmt_port); + inet_diag_req_addrs(sk, req, &entry); + entry.dport = ntohs(ireq->ir_rmt_port); if (!inet_diag_bc_run(bc, &entry)) continue; } err = inet_diag_fill_req(skb, sk, req, - NETLINK_CB(cb->skb).pid, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { cb->args[3] = j + 1; @@ -725,6 +839,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, { int i, num; int s_i, s_num; + struct net *net = sock_net(skb->sk); s_i = cb->args[1]; s_num = num = cb->args[2]; @@ -744,6 +859,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, sk_nulls_for_each(sk, node, &ilb->head) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { num++; continue; @@ -803,8 +921,7 @@ skip_listen_ht: num = 0; - if (hlist_nulls_empty(&head->chain) && - hlist_nulls_empty(&head->twchain)) + if (hlist_nulls_empty(&head->chain)) continue; if (i > s_i) @@ -812,22 +929,31 @@ skip_listen_ht: spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { - struct inet_sock *inet = inet_sk(sk); + int res; + int state; + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next_normal; - if (!(r->idiag_states & (1 << sk->sk_state))) + state = (sk->sk_state == TCP_TIME_WAIT) ? 
+ inet_twsk(sk)->tw_substate : sk->sk_state; + if (!(r->idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) + sk->sk_family != r->sdiag_family) goto next_normal; - if (r->id.idiag_sport != inet->inet_sport && + if (r->id.idiag_sport != htons(sk->sk_num) && r->id.idiag_sport) goto next_normal; - if (r->id.idiag_dport != inet->inet_dport && + if (r->id.idiag_dport != sk->sk_dport && r->id.idiag_dport) goto next_normal; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk->sk_state == TCP_TIME_WAIT) + res = inet_twsk_diag_dump(sk, skb, cb, r, bc); + else + res = inet_csk_diag_dump(sk, skb, cb, r, bc); + if (res < 0) { spin_unlock_bh(lock); goto done; } @@ -835,31 +961,6 @@ next_normal: ++num; } - if (r->idiag_states & TCPF_TIME_WAIT) { - struct inet_timewait_sock *tw; - - inet_twsk_for_each(tw, node, - &head->twchain) { - - if (num < s_num) - goto next_dying; - if (r->sdiag_family != AF_UNSPEC && - tw->tw_family != r->sdiag_family) - goto next_dying; - if (r->id.idiag_sport != tw->tw_sport && - r->id.idiag_sport) - goto next_dying; - if (r->id.idiag_dport != tw->tw_dport && - r->id.idiag_dport) - goto next_dying; - if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) { - spin_unlock_bh(lock); - goto done; - } -next_dying: - ++num; - } - } spin_unlock_bh(lock); } @@ -875,13 +976,16 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) { const struct inet_diag_handler *handler; + int err = 0; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r, bc); + else + err = PTR_ERR(handler); inet_diag_unlock_handler(handler); - return skb->len; + return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) @@ -892,7 +996,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_attrlen(cb->nlh, hdrlen)) bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc); + return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); } static inline int inet_diag_type2proto(int type) @@ -909,7 +1013,7 @@ static inline int inet_diag_type2proto(int type) static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { - struct inet_diag_req *rc = NLMSG_DATA(cb->nlh); + struct inet_diag_req *rc = nlmsg_data(cb->nlh); struct inet_diag_req_v2 req; struct nlattr *bc = NULL; int hdrlen = sizeof(struct inet_diag_req); @@ -929,7 +1033,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c static int inet_diag_get_exact_compat(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { - struct inet_diag_req *rc = NLMSG_DATA(nlh); + struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; req.sdiag_family = rc->idiag_family; @@ -944,6 +1048,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) { int hdrlen = sizeof(struct inet_diag_req); + struct net *net = sock_net(skb->sk); if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || nlmsg_len(nlh) < hdrlen) @@ -964,7 +1069,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) struct netlink_dump_control c = { .dump = inet_diag_dump_compat, }; - return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c); + return netlink_dump_start(net->diag_nlsk, skb, nlh, 
&c); } } @@ -974,6 +1079,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); + struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -992,11 +1098,11 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = inet_diag_dump, }; - return netlink_dump_start(sock_diag_nlsk, skb, h, &c); + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } - return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h)); + return inet_diag_get_exact(skb, h, nlmsg_data(h)); } static const struct sock_diag_handler inet_diag_handler = { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ff2a51b6d0..3b01959bf4b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -21,7 +21,30 @@ #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <net/sock.h> #include <net/inet_frag.h> +#include <net/inet_ecn.h> + +/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. + * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field + */ +const u8 ip_frag_ecn_table[16] = { + /* at least one fragment had CE, and others ECT_0 or ECT_1 */ + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + + /* invalid combinations : drop frame */ + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +}; +EXPORT_SYMBOL(ip_frag_ecn_table); static void inet_frag_secret_rebuild(unsigned long dummy) { @@ -29,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy) unsigned long now = jiffies; int i; + /* Per bucket lock NOT needed here, due to write lock protection */ write_lock(&f->lock); + get_random_bytes(&f->rnd, sizeof(u32)); for (i = 0; i < INETFRAGS_HASHSZ; i++) { + struct inet_frag_bucket *hb; struct inet_frag_queue *q; - struct hlist_node *p, *n; + struct hlist_node *n; - hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) { + hb = &f->hash[i]; + hlist_for_each_entry_safe(q, n, &hb->chain, list) { unsigned int hval = f->hashfn(q); if (hval != i) { + struct inet_frag_bucket *hb_dest; + hlist_del(&q->list); /* Relink to new hash chain. 
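The ip_frag_ecn_table added to inet_fragment.c above folds the ECN codepoints seen across all fragments (OR-ed together as IPFRAG_ECN_* bits) into one verdict: 0xff means drop the reassembled frame, anything else is OR-ed into the final iph->tos. A stand-alone sketch of that lookup follows; the constant names and the CE value 3 are local illustrations, not the kernel headers.

#include <stdio.h>
#include <stdint.h>

/* Illustrative bit encoding, mirroring the IPFRAG_ECN_* bits in the patch above. */
#define FRAG_NOT_ECT 0x01
#define FRAG_ECT_1   0x02
#define FRAG_ECT_0   0x04
#define FRAG_CE      0x08
#define ECN_CE_MARK  3	/* the INET_ECN_CE codepoint */

static const uint8_t ecn_verdict[16] = {
	/* CE seen along with ECT(0)/ECT(1): mark the reassembled packet CE */
	[FRAG_CE | FRAG_ECT_0]              = ECN_CE_MARK,
	[FRAG_CE | FRAG_ECT_1]              = ECN_CE_MARK,
	[FRAG_CE | FRAG_ECT_0 | FRAG_ECT_1] = ECN_CE_MARK,
	/* Not-ECT mixed with any ECT/CE fragment is invalid: drop (0xff) */
	[FRAG_NOT_ECT | FRAG_CE]                           = 0xff,
	[FRAG_NOT_ECT | FRAG_ECT_0]                        = 0xff,
	[FRAG_NOT_ECT | FRAG_ECT_1]                        = 0xff,
	[FRAG_NOT_ECT | FRAG_ECT_0 | FRAG_ECT_1]           = 0xff,
	[FRAG_NOT_ECT | FRAG_CE | FRAG_ECT_0]              = 0xff,
	[FRAG_NOT_ECT | FRAG_CE | FRAG_ECT_1]              = 0xff,
	[FRAG_NOT_ECT | FRAG_CE | FRAG_ECT_0 | FRAG_ECT_1] = 0xff,
	/* every other combination defaults to 0: leave tos untouched */
};

int main(void)
{
	/* One fragment arrived ECT(0), another CE: keep the packet, set CE. */
	uint8_t ored = FRAG_ECT_0 | FRAG_CE;
	uint8_t v = ecn_verdict[ored];

	if (v == 0xff)
		printf("drop reassembled frame\n");
	else
		printf("OR 0x%x into the final iph->tos\n", v);
	return 0;
}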
*/ - hlist_add_head(&q->list, &f->hash[hval]); + hb_dest = &f->hash[hval]; + hlist_add_head(&q->list, &hb_dest->chain); } } } @@ -55,14 +85,14 @@ void inet_frags_init(struct inet_frags *f) { int i; - for (i = 0; i < INETFRAGS_HASHSZ; i++) - INIT_HLIST_HEAD(&f->hash[i]); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { + struct inet_frag_bucket *hb = &f->hash[i]; + spin_lock_init(&hb->chain_lock); + INIT_HLIST_HEAD(&hb->chain); + } rwlock_init(&f->lock); - f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ - (jiffies ^ (jiffies >> 6))); - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, (unsigned long)f); f->secret_timer.expires = jiffies + f->secret_interval; @@ -73,8 +103,9 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { nf->nqueues = 0; - atomic_set(&nf->mem, 0); + init_frag_mem_limit(nf); INIT_LIST_HEAD(&nf->lru_list); + spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); @@ -89,18 +120,28 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) nf->low_thresh = 0; local_bh_disable(); - inet_frag_evictor(nf, f); + inet_frag_evictor(nf, f, true); local_bh_enable(); + + percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) { - write_lock(&f->lock); + struct inet_frag_bucket *hb; + unsigned int hash; + + read_lock(&f->lock); + hash = f->hashfn(fq); + hb = &f->hash[hash]; + + spin_lock(&hb->chain_lock); hlist_del(&fq->list); - list_del(&fq->lru_list); - fq->net->nqueues--; - write_unlock(&f->lock); + spin_unlock(&hb->chain_lock); + + read_unlock(&f->lock); + inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -117,12 +158,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb, int *work) + struct sk_buff *skb) { - if (work) - *work -= skb->truesize; - - atomic_sub(skb->truesize, &nf->mem); if (f->skb_free) f->skb_free(skb); kfree_skb(skb); @@ -133,6 +170,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, { struct sk_buff *fp; struct netns_frags *nf; + unsigned int sum, sum_truesize = 0; WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -143,13 +181,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(nf, f, fp, work); + sum_truesize += fp->truesize; + frag_kfree_skb(nf, f, fp); fp = xp; } - + sum = sum_truesize + f->qsize; if (work) - *work -= f->qsize; - atomic_sub(f->qsize, &nf->mem); + *work -= sum; + sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); @@ -158,23 +197,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) { struct inet_frag_queue *q; int work, evicted = 0; - work = atomic_read(&nf->mem) - nf->low_thresh; - while (work > 0) { - read_lock(&f->lock); + if (!force) { + if (frag_mem_limit(nf) <= nf->high_thresh) + return 0; + } + + work = frag_mem_limit(nf) - nf->low_thresh; + while (work > 0 || force) { + spin_lock(&nf->lru_lock); + if (list_empty(&nf->lru_list)) { - read_unlock(&f->lock); + spin_unlock(&nf->lru_lock); break; } q = 
list_first_entry(&nf->lru_list, struct inet_frag_queue, lru_list); atomic_inc(&q->refcnt); - read_unlock(&f->lock); + /* Remove q from list to avoid several CPUs grabbing it */ + list_del_init(&q->lru_list); + + spin_unlock(&nf->lru_lock); spin_lock(&q->lock); if (!(q->last_in & INET_FRAG_COMPLETE)) @@ -194,28 +242,30 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) { + struct inet_frag_bucket *hb; struct inet_frag_queue *qp; -#ifdef CONFIG_SMP - struct hlist_node *n; -#endif unsigned int hash; - write_lock(&f->lock); + read_lock(&f->lock); /* Protects against hash rebuild */ /* * While we stayed w/o the lock other CPU could update * the rnd seed, so we need to re-calculate the hash * chain. Fortunatelly the qp_in can be used to get one. */ hash = f->hashfn(qp_in); + hb = &f->hash[hash]; + spin_lock(&hb->chain_lock); + #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we - * promoted read lock to write lock. + * released the hash bucket lock. */ - hlist_for_each_entry(qp, n, &f->hash[hash], list) { + hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); - write_unlock(&f->lock); + spin_unlock(&hb->chain_lock); + read_unlock(&f->lock); qp_in->last_in |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; @@ -227,10 +277,11 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - hlist_add_head(&qp->list, &f->hash[hash]); - list_add_tail(&qp->lru_list, &nf->lru_list); - nf->nqueues++; - write_unlock(&f->lock); + hlist_add_head(&qp->list, &hb->chain); + inet_frag_lru_add(nf, qp); + spin_unlock(&hb->chain_lock); + read_unlock(&f->lock); + return qp; } @@ -243,12 +294,14 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (q == NULL) return NULL; + q->net = nf; f->constructor(q, arg); - atomic_add(f->qsize, &nf->mem); + add_frag_mem_limit(q, f->qsize); + setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - q->net = nf; + INIT_LIST_HEAD(&q->lru_list); return q; } @@ -269,18 +322,40 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) __releases(&f->lock) { + struct inet_frag_bucket *hb; struct inet_frag_queue *q; - struct hlist_node *n; + int depth = 0; + + hb = &f->hash[hash]; - hlist_for_each_entry(q, n, &f->hash[hash], list) { + spin_lock(&hb->chain_lock); + hlist_for_each_entry(q, &hb->chain, list) { if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); + spin_unlock(&hb->chain_lock); read_unlock(&f->lock); return q; } + depth++; } + spin_unlock(&hb->chain_lock); read_unlock(&f->lock); - return inet_frag_create(nf, f, key); + if (depth <= INETFRAGS_MAXDEPTH) + return inet_frag_create(nf, f, key); + else + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); + +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix) +{ + static const char msg[] = "inet_frag_find: Fragment hash bucket" + " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) + ". 
Dropping fragment.\n"; + + if (PTR_ERR(q) == -ENOBUFS) + LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); +} +EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7880af97020..43116e8c8e1 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,6 +24,31 @@ #include <net/secure_seq.h> #include <net/ip.h> +static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) +{ + static u32 inet_ehash_secret __read_mostly; + + net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); + + return __inet_ehashfn(laddr, lport, faddr, fport, + inet_ehash_secret + net_hash_mix(net)); +} + + +static unsigned int inet_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const __be32 laddr = inet->inet_rcv_saddr; + const __u16 lport = inet->inet_num; + const __be32 faddr = inet->inet_daddr; + const __be16 fport = inet->inet_dport; + struct net *net = sock_net(sk); + + return inet_ehashfn(net, laddr, lport, faddr, fport); +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. @@ -39,6 +64,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, write_pnet(&tb->ib_net, hold_net(net)); tb->port = snum; tb->fastreuse = 0; + tb->fastreuseport = 0; tb->num_owners = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); @@ -119,13 +145,12 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ - struct hlist_node *node; - inet_bind_bucket_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && tb->port == port) break; } - if (!node) { + if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, sock_net(sk), head, port); if (!tb) { @@ -151,16 +176,16 @@ static inline int compute_score(struct sock *sk, struct net *net, if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { __be32 rcv_saddr = inet->inet_rcv_saddr; - score = sk->sk_family == PF_INET ? 1 : 0; + score = sk->sk_family == PF_INET ? 
2 : 1; if (rcv_saddr) { if (rcv_saddr != daddr) return -1; - score += 2; + score += 4; } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) return -1; - score += 2; + score += 4; } } return score; @@ -176,6 +201,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) { @@ -183,17 +209,29 @@ struct sock *__inet_lookup_listener(struct net *net, struct hlist_nulls_node *node; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore; + int score, hiscore, matches = 0, reuseport = 0; + u32 phash = 0; rcu_read_lock(); begin: result = NULL; - hiscore = -1; + hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { result = sk; hiscore = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == hiscore && reuseport) { + matches++; + if (((u64)phash * matches) >> 32 == 0) + result = sk; + phash = next_pseudo_random32(phash); } } /* @@ -217,13 +255,26 @@ begin: } EXPORT_SYMBOL_GPL(__inet_lookup_listener); +/* All sockets share common refcount, but have different destructors */ +void sock_gen_put(struct sock *sk) +{ + if (!atomic_dec_and_test(&sk->sk_refcnt)) + return; + + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_free(inet_twsk(sk)); + else + sk_free(sk); +} +EXPORT_SYMBOL_GPL(sock_gen_put); + struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif) { - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; @@ -237,16 +288,18 @@ struct sock *__inet_lookup_established(struct net *net, rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (INET_MATCH(sk, net, hash, acookie, - saddr, daddr, ports, dif)) { + if (sk->sk_hash != hash) + continue; + if (likely(INET_MATCH(sk, net, acookie, + saddr, daddr, ports, dif))) { if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) - goto begintw; - if (unlikely(!INET_MATCH(sk, net, hash, acookie, - saddr, daddr, ports, dif))) { - sock_put(sk); + goto out; + if (unlikely(!INET_MATCH(sk, net, acookie, + saddr, daddr, ports, dif))) { + sock_gen_put(sk); goto begin; } - goto out; + goto found; } } /* @@ -256,33 +309,9 @@ begin: */ if (get_nulls_value(node) != slot) goto begin; - -begintw: - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_nulls_for_each_rcu(sk, node, &head->twchain) { - if (INET_TW_MATCH(sk, net, hash, acookie, - saddr, daddr, ports, dif)) { - if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { - sk = NULL; - goto out; - } - if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie, - saddr, daddr, ports, dif))) { - sock_put(sk); - goto begintw; - } - goto out; - } - } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
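The SO_REUSEPORT path added to __inet_lookup_listener above picks one listener out of the sockets that tie for the best score: the Nth tying socket replaces the current choice with probability 1/N, driven by a hash of the packet's 4-tuple, so listeners get an equal share and a given flow always resolves to the same socket. A user-space model of that selection rule; next_prandom32() is only a stand-in for the kernel's next_pseudo_random32().

#include <stdio.h>
#include <stdint.h>

/* Stand-in for next_pseudo_random32(); any 32-bit mixing step works here. */
static uint32_t next_prandom32(uint32_t x)
{
	return x * 1103515245u + 12345u;
}

/* Pick one of 'nsock' equally-scored reuseport listeners for a flow hash. */
static int pick_listener(uint32_t flow_hash, int nsock)
{
	int result = 0, matches = 0;
	uint32_t phash = flow_hash;

	for (int i = 0; i < nsock; i++) {
		matches++;
		/* Replace the current choice with probability 1/matches:
		 * ((u64)phash * matches) >> 32 == 0  <=>  phash < 2^32 / matches
		 */
		if (((uint64_t)phash * matches) >> 32 == 0)
			result = i;
		phash = next_prandom32(phash);
	}
	return result;
}

int main(void)
{
	printf("flow 0x12345678 -> listener %d of 4\n",
	       pick_listener(0x12345678u, 4));
	return 0;
}

Because the decision is keyed on the flow hash rather than a global counter, retransmitted SYNs for the same 4-tuple always land on the same listener.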
- */ - if (get_nulls_value(node) != slot) - goto begintw; - sk = NULL; out: + sk = NULL; +found: rcu_read_unlock(); return sk; } @@ -298,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, @@ -307,35 +336,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; - struct inet_timewait_sock *tw; + struct inet_timewait_sock *tw = NULL; int twrefcnt = 0; spin_lock(lock); - /* Check TIME-WAIT sockets first. */ - sk_nulls_for_each(sk2, node, &head->twchain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, net, hash, acookie, - saddr, daddr, ports, dif)) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ sk_nulls_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, net, hash, acookie, - saddr, daddr, ports, dif)) + if (sk2->sk_hash != hash) + continue; + + if (likely(INET_MATCH(sk2, net, acookie, + saddr, daddr, ports, dif))) { + if (sk2->sk_state == TCP_TIME_WAIT) { + tw = inet_twsk(sk2); + if (twsk_unique(sk, sk2, twp)) + break; + } goto not_unique; + } } -unique: /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ + * in hash table socket with a funny identity. + */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; @@ -444,7 +467,7 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - done =__sk_nulls_del_node_init_rcu(sk); + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -469,16 +492,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, int i, remaining, low, high, port; static u32 hint; u32 offset = hint + port_offset; - struct hlist_node *node; struct inet_timewait_sock *tw = NULL; - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - if (inet_is_reserved_local_port(port)) + if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; @@ -488,10 +510,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, * because the established check is already * unique enough. 
*/ - inet_bind_bucket_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), net) && tb->port == port) { - if (tb->fastreuse >= 0) + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); if (!check_established(death_row, sk, @@ -508,6 +531,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, break; } tb->fastreuse = -1; + tb->fastreuseport = -1; goto ok; next_port: diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index cc280a3f4f9..f17ea49b28f 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/if_vlan.h> #include <linux/inet_lro.h> +#include <net/checksum.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); @@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) *(p+2) = lro_desc->tcp_rcv_tsecr; } + csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); iph->tot_len = htons(lro_desc->ip_tot_len); - iph->check = 0; - iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); - tcph->check = 0; tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); @@ -231,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, lro_desc->last_skb = skb; } -static void lro_add_frags(struct net_lro_desc *lro_desc, - int len, int hlen, int truesize, - struct skb_frag_struct *skb_frags, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *skb = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb->truesize += truesize; - - skb_frags[0].page_offset += hlen; - skb_frag_size_sub(&skb_frags[0], hlen); - - while (tcp_data_len > 0) { - *(lro_desc->next_frag) = *skb_frags; - tcp_data_len -= skb_frag_size(skb_frags); - lro_desc->next_frag++; - skb_frags++; - skb_shinfo(skb)->nr_frags++; - } -} static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, @@ -372,128 +348,6 @@ out: return 1; } - -static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *mac_hdr, - int hlen, __wsum sum, - u32 ip_summed) -{ - struct sk_buff *skb; - struct skb_frag_struct *skb_frags; - int data_len = len; - int hdr_len = min(len, hlen); - - skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); - if (!skb) - return NULL; - - skb_reserve(skb, lro_mgr->frag_align_pad); - skb->len = len; - skb->data_len = len - hdr_len; - skb->truesize += true_size; - skb->tail += hdr_len; - - memcpy(skb->data, mac_hdr, hdr_len); - - skb_frags = skb_shinfo(skb)->frags; - while (data_len > 0) { - *skb_frags = *frags; - data_len -= skb_frag_size(frags); - skb_frags++; - frags++; - skb_shinfo(skb)->nr_frags++; - } - - skb_shinfo(skb)->frags[0].page_offset += hdr_len; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); - - skb->ip_summed = ip_summed; - skb->csum = sum; - skb->protocol = eth_type_trans(skb, lro_mgr->dev); - return skb; -} - -static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *priv, __wsum sum) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - struct sk_buff *skb; - u64 flags; - void *mac_hdr; - int mac_hdr_len; - int hdr_len = LRO_MAX_PG_HLEN; - int vlan_hdr_len = 
0; - - if (!lro_mgr->get_frag_header || - lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { - mac_hdr = skb_frag_address(frags); - goto out1; - } - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out1; - - hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); - mac_hdr_len = (int)((void *)(iph) - mac_hdr); - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out1; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) - goto out1; - - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, 0, lro_mgr->ip_summed_aggr); - if (!skb) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - iph = (void *)(skb->data + vlan_hdr_len); - tcph = (void *)((u8 *)skb->data + vlan_hdr_len - + IP_HDR_LEN(iph)); - - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return NULL; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) - goto out2; - - lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return NULL; - -out2: /* send aggregated packets to the stack */ - lro_flush(lro_mgr, lro_desc); - -out1: /* Original packet has to be posted to the stack */ - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, sum, lro_mgr->ip_summed); -out: - return skb; -} - void lro_receive_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, void *priv) @@ -507,23 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr, } EXPORT_SYMBOL(lro_receive_skb); -void lro_receive_frags(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, void *priv, __wsum sum) -{ - struct sk_buff *skb; - - skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum); - if (!skb) - return; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); -} -EXPORT_SYMBOL(lro_receive_frags); - void lro_flush_all(struct net_lro_mgr *lro_mgr) { int i; @@ -535,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr) } } EXPORT_SYMBOL(lro_flush_all); - -void lro_flush_pkt(struct net_lro_mgr *lro_mgr, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (lro_desc->active) - lro_flush(lro_mgr, lro_desc); -} -EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2784db3155f..6d592f8555f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -87,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, refcnt += inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); -#ifdef SOCK_REFCNT_DEBUG - if (atomic_read(&tw->tw_refcnt) != 1) { - pr_debug("%s timewait_sock %p refcnt=%d\n", - tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); - } -#endif - while (refcnt) { - inet_twsk_put(tw); - refcnt--; - } + BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); + atomic_sub(refcnt, &tw->tw_refcnt); } -static noinline void inet_twsk_free(struct inet_timewait_sock *tw) +void inet_twsk_free(struct 
inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); @@ -118,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(inet_twsk_put); +static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, + struct hlist_nulls_head *list) +{ + hlist_nulls_add_head_rcu(&tw->tw_node, list); +} + +static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind_node, list); +} + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -146,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); /* - * Step 2: Hash TW into TIMEWAIT chain. - * Should be done before removing sk from established chain - * because readers are lockless and search established first. + * Step 2: Hash TW into tcp ehash chain. + * Notes : + * - tw_refcnt is set to 3 because : + * - We have one reference from bhash chain. + * - We have one reference from ehash chain. + * We can use atomic_set() because prior spin_lock()/spin_unlock() + * committed into memory all tw fields. */ - inet_twsk_add_node_rcu(tw, &ehead->twchain); + atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + inet_twsk_add_node_rcu(tw, &ehead->chain); - /* Step 3: Remove SK from established hash. */ + /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - /* - * Notes : - * - We initially set tw_refcnt to 0 in inet_twsk_alloc() - * - We add one reference for the bhash link - * - We add one reference for the ehash link - * - We want this refcnt update done before allowing other - * threads to find this tw in ehash chain. 
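With the dedicated twchain gone, established lookups, diag dumps and inet_twsk_purge() all walk a single ehash chain that mixes full sockets and timewait minisockets, so readers dispatch on sk->sk_state and drop references through sock_gen_put(), which picks the right destructor off the shared refcount. A condensed user-space model of that dispatch; struct sock here is a two-field stand-in, not the kernel structure.

#include <stdio.h>

enum { TCP_ESTABLISHED = 1, TCP_TIME_WAIT = 6 };	/* illustrative subset */

struct sock {
	int sk_state;
	int sk_refcnt;
};

static void tw_free(struct sock *sk)   { printf("timewait destructor for %p\n", (void *)sk); }
static void full_free(struct sock *sk) { printf("full-socket destructor for %p\n", (void *)sk); }

/* Both socket flavours share one refcount; only the destructor differs. */
static void sock_gen_put_sketch(struct sock *sk)
{
	if (--sk->sk_refcnt != 0)
		return;
	if (sk->sk_state == TCP_TIME_WAIT)
		tw_free(sk);
	else
		full_free(sk);
}

int main(void)
{
	struct sock tw  = { .sk_state = TCP_TIME_WAIT,   .sk_refcnt = 1 };
	struct sock est = { .sk_state = TCP_ESTABLISHED, .sk_refcnt = 2 };

	sock_gen_put_sketch(&tw);	/* last reference: timewait destructor runs */
	sock_gen_put_sketch(&est);	/* a reference remains: nothing freed */
	return 0;
}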
- */ - atomic_add(1 + 1 + 1, &tw->tw_refcnt); - spin_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); @@ -216,7 +215,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, const int slot) { struct inet_timewait_sock *tw; - struct hlist_node *node; unsigned int killed; int ret; @@ -229,7 +227,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, killed = 0; ret = 0; rescan: - inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); @@ -388,11 +386,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, if (slot >= INET_TWDR_TWKILL_SLOTS) slot = INET_TWDR_TWKILL_SLOTS - 1; } - tw->tw_ttd = jiffies + timeo; + tw->tw_ttd = inet_tw_time_stamp() + timeo; slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); list = &twdr->cells[slot]; } else { - tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK); if (twdr->twcal_hand < 0) { twdr->twcal_hand = 0; @@ -438,10 +436,10 @@ void inet_twdr_twcal_tick(unsigned long data) for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { if (time_before_eq(j, now)) { - struct hlist_node *node, *safe; + struct hlist_node *safe; struct inet_timewait_sock *tw; - inet_twsk_for_each_inmate_safe(tw, node, safe, + inet_twsk_for_each_inmate_safe(tw, safe, &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); @@ -491,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, restart_rcu: rcu_read_lock(); restart: - sk_nulls_for_each_rcu(sk, node, &head->twchain) { + sk_nulls_for_each_rcu(sk, node, &head->chain) { + if (sk->sk_state != TCP_TIME_WAIT) + continue; tw = inet_twsk(sk); if ((tw->tw_family != family) || atomic_read(&twsk_net(tw)->count)) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index dfba343b250..bd5f5928167 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -26,20 +26,7 @@ * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. - * At this moment this information consists only of ID field for the next - * outgoing IP packet. This field is incremented with each packet as encoded - * in inet_getid() function (include/net/inetpeer.h). - * At the moment of writing this notes identifier of IP packets is generated - * to be unpredictable using this code only for packets subjected - * (actually or potentially) to defragmentation. I.e. DF packets less than - * PMTU in size uses a constant ID and do not use this code (see - * ip_select_ident() in include/net/ip.h). * - * Route cache entries hold references to our nodes. - * New cache entries get references via lookup by destination IP address in - * the avl tree. The reference is grabbed only when it's needed i.e. only - * when we try to output IP packet which needs an unpredictable ID (see - * __ip_select_ident() in net/ipv4/route.c). * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. 
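The flush_seq/flush_check machinery added to inetpeer.c just below replaces eager tree flushes with a lazy generation check: a per-family counter is bumped whenever the peer cache should be invalidated, and each inet_peer_base compares its stored generation on the next lookup. The pattern reduced to plain C; the kernel uses atomics and frees the AVL tree via RCU, both simplified away here.

#include <stdio.h>

/* Bumped by whoever wants the peer caches dropped (one counter per family). */
static unsigned int global_flush_seq;

struct peer_base {
	unsigned int flush_seq;	/* generation this base was last flushed at */
	int total;		/* number of cached entries */
};

static void invalidate(struct peer_base *base)
{
	base->total = 0;	/* the kernel frees the whole tree via call_rcu() here */
}

/* Called at the start of every lookup, as flush_check() is in inet_getpeer(). */
static void flush_check(struct peer_base *base)
{
	if (base->flush_seq != global_flush_seq) {
		invalidate(base);
		base->flush_seq = global_flush_seq;
	}
}

int main(void)
{
	struct peer_base base = { .flush_seq = ~0u, .total = 42 };

	global_flush_seq++;	/* someone requested a flush */
	flush_check(&base);	/* the next lookup notices and drops the cache */
	printf("entries after lazy flush: %d\n", base.total);
	return 0;
}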
The less-recently-used entry can @@ -62,7 +49,6 @@ * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable - * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; @@ -82,23 +68,32 @@ static const struct inet_peer peer_fake_node = { .avl_height = 0 }; -struct inet_peer_base { - struct inet_peer __rcu *root; - seqlock_t lock; - int total; -}; +void inet_peer_base_init(struct inet_peer_base *bp) +{ + bp->root = peer_avl_empty_rcu; + seqlock_init(&bp->lock); + bp->flush_seq = ~0U; + bp->total = 0; +} +EXPORT_SYMBOL_GPL(inet_peer_base_init); -static struct inet_peer_base v4_peers = { - .root = peer_avl_empty_rcu, - .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), - .total = 0, -}; +static atomic_t v4_seq = ATOMIC_INIT(0); +static atomic_t v6_seq = ATOMIC_INIT(0); -static struct inet_peer_base v6_peers = { - .root = peer_avl_empty_rcu, - .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), - .total = 0, -}; +static atomic_t *inetpeer_seq_ptr(int family) +{ + return (family == AF_INET ? &v4_seq : &v6_seq); +} + +static inline void flush_check(struct inet_peer_base *base, int family) +{ + atomic_t *fp = inetpeer_seq_ptr(family); + + if (unlikely(base->flush_seq != atomic_read(fp))) { + inetpeer_invalidate_tree(base); + base->flush_seq = atomic_read(fp); + } +} #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ @@ -110,8 +105,8 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min static void inetpeer_gc_worker(struct work_struct *work) { - struct inet_peer *p, *n; - LIST_HEAD(list); + struct inet_peer *p, *n, *c; + struct list_head list; spin_lock_bh(&gc_lock); list_replace_init(&gc_list, &list); @@ -122,17 +117,19 @@ static void inetpeer_gc_worker(struct work_struct *work) list_for_each_entry_safe(p, n, &list, gc_list) { - if(need_resched()) + if (need_resched()) cond_resched(); - if (p->avl_left != peer_avl_empty) { - list_add_tail(&p->avl_left->gc_list, &list); - p->avl_left = peer_avl_empty; + c = rcu_dereference_protected(p->avl_left, 1); + if (c != peer_avl_empty) { + list_add_tail(&c->gc_list, &list); + p->avl_left = peer_avl_empty_rcu; } - if (p->avl_right != peer_avl_empty) { - list_add_tail(&p->avl_right->gc_list, &list); - p->avl_right = peer_avl_empty; + c = rcu_dereference_protected(p->avl_right, 1); + if (c != peer_avl_empty) { + list_add_tail(&c->gc_list, &list); + p->avl_right = peer_avl_empty_rcu; } n = list_entry(p->gc_list.next, struct inet_peer, gc_list); @@ -176,7 +173,7 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker); + INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } static int addr_compare(const struct inetpeer_addr *a, @@ -209,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a, stackptr = _stack; \ *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty; ) { \ + u != peer_avl_empty;) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ @@ -264,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu; ) { \ + u->avl_right != peer_avl_empty_rcu;) { \ v = &u->avl_right; \ *stackptr++ = v; \ u = rcu_deref_locked(*v, base); \ @@ -401,11 +398,6 @@ static void unlink_from_pool(struct inet_peer 
*p, struct inet_peer_base *base, call_rcu(&p->rcu, inetpeer_free_rcu); } -static struct inet_peer_base *family_to_base(int family) -{ - return family == AF_INET ? &v4_peers : &v6_peers; -} - /* perform garbage collect on all items stacked during a lookup */ static int inet_peer_gc(struct inet_peer_base *base, struct inet_peer __rcu **stack[PEER_MAXDEPTH], @@ -443,14 +435,17 @@ static int inet_peer_gc(struct inet_peer_base *base, return cnt; } -struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) +struct inet_peer *inet_getpeer(struct inet_peer_base *base, + const struct inetpeer_addr *daddr, + int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; unsigned int sequence; int invalidated, gccnt = 0; + flush_check(base, daddr->family); + /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ @@ -488,17 +483,12 @@ relookup: p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, - (daddr->family == AF_INET) ? - secure_ip_id(daddr->addr.a4) : - secure_ipv6_id(daddr->addr.a6)); - p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; - p->rate_last = 0; - p->pmtu_expires = 0; - p->pmtu_orig = 0; - memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); + /* 60*HZ is arbitrary, but chosen enough high so that the first + * calculation of tokens is at its maximum. + */ + p->rate_last = jiffies - 60*HZ; INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ @@ -514,7 +504,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); @@ -571,26 +561,19 @@ static void inetpeer_inval_rcu(struct rcu_head *head) schedule_delayed_work(&gc_work, gc_delay); } -void inetpeer_invalidate_tree(int family) +void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *old, *new, *prev; - struct inet_peer_base *base = family_to_base(family); + struct inet_peer *root; write_seqlock_bh(&base->lock); - old = base->root; - if (old == peer_avl_empty_rcu) - goto out; - - new = peer_avl_empty_rcu; - - prev = cmpxchg(&base->root, old, new); - if (prev == old) { + root = rcu_deref_locked(base->root, base); + if (root != peer_avl_empty) { + base->root = peer_avl_empty_rcu; base->total = 0; - call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); + call_rcu(&root->gc_rcu, inetpeer_inval_rcu); } -out: write_sequnlock_bh(&base->lock); } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index ab09b126423..3a83ce5efa8 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,24 @@ #include <net/route.h> #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + skb->ignore_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -54,10 +72,15 @@ static int ip_forward_finish(struct sk_buff *skb) int ip_forward(struct sk_buff *skb) { + u32 mtu; struct iphdr *iph; /* Our header */ struct 
rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + /* that should never happen */ + if (skb->pkt_type != PACKET_HOST) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -67,9 +90,6 @@ int ip_forward(struct sk_buff *skb) if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; - if (skb->pkt_type != PACKET_HOST) - goto drop; - skb_forward_csum(skb); /* @@ -85,14 +105,15 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + IPCB(skb)->flags |= IPSKB_FORWARDED; + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->dst))); + htonl(mtu)); goto drop; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9dbd3dd6022..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -79,40 +79,11 @@ struct ipq { struct inet_peer *peer; }; -/* RFC 3168 support : - * We want to check ECN values of all fragments, do detect invalid combinations. - * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. - */ -#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ -#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ -#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ -#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ - static inline u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } -/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements - * Value : 0xff if frame should be dropped. - * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field - */ -static const u8 ip4_frag_ecn_table[16] = { - /* at least one fragment had CE, and others ECT_0 or ECT_1 */ - [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, - [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, - [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, - - /* invalid combinations : drop frame */ - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, -}; - static struct inet_frags ip4_frags; int ip_frag_nqueues(struct net *net) @@ -122,7 +93,7 @@ int ip_frag_nqueues(struct net *net) int ip_frag_mem(struct net *net) { - return atomic_read(&net->ipv4.frags.mem); + return sum_frag_mem_limit(&net->ipv4.frags); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -135,6 +106,7 @@ struct ip4_create_arg { static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { + net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); @@ -161,16 +133,13 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -/* Memory Tracking Functions. 
*/ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); + struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, + frags); + struct net *net = container_of(ipv4, struct net, ipv4); + struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; @@ -180,7 +149,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) @@ -215,7 +184,7 @@ static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); if (evicted) IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } @@ -251,8 +220,7 @@ static void ip_expire(unsigned long arg) if (!head->dev) goto out_rcu_unlock; - /* skb dst is stale, drop it, and perform route lookup again */ - skb_dst_drop(head); + /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); @@ -264,8 +232,9 @@ static void ip_expire(unsigned long arg) * "Fragment Reassembly Timeout" message, per RFC792. */ if (qp->user == IP_DEFRAG_AF_PACKET || - (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL)) + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; @@ -295,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); - return NULL; } /* Is the fragment too far ahead to be part of ipq? 
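ipqhashfn(), just above, now seeds the hash secret on first use via net_get_random_once() and mixes it into jhash_3words() over (id, protocol, saddr, daddr); combined with the INETFRAGS_MAXDEPTH chain-length cap added in inet_fragment.c, a remote sender cannot park unbounded state in a predictable bucket. An approximate user-space version of the bucket selection; mix3() and the table sizes are illustrative stand-ins, not the kernel's jhash or constants.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>

#define HASHSZ   64	/* illustrative size standing in for INETFRAGS_HASHSZ */
#define MAXDEPTH 128	/* illustrative cap standing in for INETFRAGS_MAXDEPTH */

/* Crude stand-in for jhash_3words(); only the structure matters here. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	uint32_t h = seed ^ a;

	h = h * 0x9e3779b1u ^ b;
	h = h * 0x85ebca6bu ^ c;
	return h ^ (h >> 16);
}

static uint32_t frag_secret;
static bool frag_secret_set;

static unsigned int ipqhash(uint16_t id, uint8_t prot,
			    uint32_t saddr, uint32_t daddr)
{
	if (!frag_secret_set) {		/* net_get_random_once() equivalent */
		frag_secret = (uint32_t)rand();
		frag_secret_set = true;
	}
	return mix3((uint32_t)id << 16 | prot, saddr, daddr, frag_secret)
		& (HASHSZ - 1);
}

int main(void)
{
	unsigned int bucket = ipqhash(0x1234, 17, 0xc0a80001u, 0xc0a80002u);

	printf("fragment queue hashes to bucket %u (chain depth capped at %d)\n",
	       bucket, MAXDEPTH);
	return 0;
}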
*/ @@ -336,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; + unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); @@ -345,9 +312,12 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp); + + sum_truesize += fp->truesize; + kfree_skb(fp); fp = xp; } while (fp); + sub_frag_mem_limit(&qp->q, sum_truesize); qp->q.last_in = 0; qp->q.len = 0; @@ -492,7 +462,8 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it); + sub_frag_mem_limit(&qp->q, free_it->truesize); + kfree_skb(free_it); } } @@ -515,17 +486,26 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - atomic_add(skb->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len + ihl > qp->q.max_size) + qp->q.max_size = skb->len + ihl; + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - qp->q.meat == qp->q.len) - return ip_frag_reasm(qp, prev, dev); + qp->q.meat == qp->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + err = ip_frag_reasm(qp, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } - write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); - write_unlock(&ip4_frags.lock); + skb_dst_drop(skb); + inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -550,7 +530,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, ipq_kill(qp); - ecn = ip4_frag_ecn_table[qp->ecn]; + ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; @@ -586,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split @@ -609,7 +589,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, clone->truesize); } skb_push(head, head->data - skb_network_header(head)); @@ -637,14 +617,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &qp->q.net->mem); + sub_frag_mem_limit(&qp->q, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; + IPCB(head)->frag_max_size = qp->q.max_size; iph = ip_hdr(head); - iph->frag_off = 0; + /* max_size != 0 implies at least one fragment had IP_DF set */ + iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); @@ -674,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. 
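ip_check_defrag(), reworked in the hunk that follows, no longer pulls the IP header into the linear area up front; it copies the header out with skb_copy_bits() into a local struct iphdr, validates it, and only clones and trims the skb once the packet is known to be a fragment. The same header checks over a plain byte buffer; the struct layout and field names below are a minimal local definition, not <linux/ip.h>.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <arpa/inet.h>

struct ipv4_hdr {		/* minimal view of the on-wire IPv4 header */
	uint8_t  ver_ihl;	/* version (high nibble) + header length in words */
	uint8_t  tos;
	uint16_t tot_len;
	uint16_t id;
	uint16_t frag_off;	/* flags + fragment offset */
	uint8_t  ttl;
	uint8_t  protocol;
	uint16_t check;
	uint32_t saddr, daddr;
};

/* Mirrors the sanity checks ip_check_defrag() applies to the copied header. */
static bool looks_like_fragment(const uint8_t *pkt, size_t pkt_len)
{
	struct ipv4_hdr iph;

	if (pkt_len < sizeof(iph))
		return false;
	memcpy(&iph, pkt, sizeof(iph));		/* skb_copy_bits() equivalent */

	if ((iph.ver_ihl >> 4) != 4 || (iph.ver_ihl & 0x0f) < 5)
		return false;
	if (pkt_len < ntohs(iph.tot_len) ||
	    ntohs(iph.tot_len) < (unsigned)(iph.ver_ihl & 0x0f) * 4)
		return false;
	/* MF flag set or a non-zero fragment offset: part of a fragment train */
	return (ntohs(iph.frag_off) & 0x3fff) != 0;
}

int main(void)
{
	uint8_t pkt[40] = { 0x45, 0, 0, 40, 0x12, 0x34, 0x20, 0x00, 64, 17 };

	printf("fragment? %s\n", looks_like_fragment(pkt, sizeof(pkt)) ? "yes" : "no");
	return 0;
}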
*/ - if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) - ip_evictor(net); + ip_evictor(net); /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { @@ -698,34 +679,33 @@ EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) { - const struct iphdr *iph; + struct iphdr iph; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) + if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) return skb; - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return skb; - if (!pskb_may_pull(skb, iph->ihl*4)) + if (iph.ihl < 5 || iph.version != 4) return skb; - iph = ip_hdr(skb); - len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl * 4)) + + len = ntohs(iph.tot_len); + if (skb->len < len || len < (iph.ihl * 4)) return skb; - if (ip_is_fragment(ip_hdr(skb))) { + if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { + if (!pskb_may_pull(skb, iph.ihl*4)) + return skb; if (pskb_trim_rcsum(skb, len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) return NULL; - skb->rxhash = 0; + skb_clear_hash(skb); } } return skb; @@ -793,6 +773,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[2].data = &net->ipv4.frags.timeout; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } hdr = register_net_sysctl(net, "net/ipv4", table); @@ -839,14 +823,22 @@ static inline void ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. + /* Fragment cache limits. + * + * The fragment memory accounting code, (tries to) account for + * the real memory usage, by measuring both the size of frag + * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) + * and the SKB's truesize. + * + * A 64K fragment consumes 129736 bytes (44*2944)+200 + * (1500 truesize == 2944, sizeof(struct ipq) == 200) + * + * We will commit 4MB at one time. Should we cross that limit + * we will prune down to 3MB, making room for approx 8 big 64K + * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 256 * 1024; - net->ipv4.frags.low_thresh = 192 * 1024; + net->ipv4.frags.high_thresh = 4 * 1024 * 1024; + net->ipv4.frags.low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f49047b7960..9b842544aea 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -37,7 +37,7 @@ #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> @@ -108,417 +108,54 @@ fatal route to network, even if it were you who configured fatal static route: you are innocent. :-) - - - 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain - practically identical code. It would be good to glue them - together, but it is not very evident, how to make them modular. 
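The long block removed below is ip_gre's private tunnel hash: four tables ordered by how specific a tunnel is ((remote,local), (remote,*), (*,local), (*,*)), each bucket chosen by a 4-bit fold of the key and, when present, the remote address; the switch to <net/ip_tunnels.h> above is where that shared logic now lives. The bucket arithmetic on its own, as a rough sketch with addresses taken as host-order integers for simplicity.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define HASH_SIZE 16

/* 4-bit fold used by the removed GRE hash: HASH(addr) in the old code. */
static unsigned int hash4(uint32_t addr)
{
	return (addr ^ (addr >> 4)) & 0xF;
}

/* Pick table (0..3) and bucket for a tunnel, as __ipgre_bucket() did. */
static void tunnel_bucket(uint32_t local, uint32_t remote, bool remote_mcast,
			  uint32_t key, int *prio, unsigned int *h)
{
	*h = hash4(key);
	*prio = 0;
	if (local)
		*prio |= 1;		/* (*,local) or better */
	if (remote && !remote_mcast) {
		*prio |= 2;		/* one of the (remote,...) tables */
		*h ^= hash4(remote);
	}
}

int main(void)
{
	int prio;
	unsigned int h;

	tunnel_bucket(0x0a000001u, 0x0a000002u, false, 42, &prio, &h);
	printf("tunnel goes in table %d, bucket %u of %d\n", prio, h, HASH_SIZE);
	return 0;
}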
- sit is integral part of IPv6, ipip and gre are naturally modular. - We could extract common parts (hash table, ioctl etc) - to a separate module (ip_tunnel.c). - Alexey Kuznetsov. */ +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); + static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); -static void ipgre_tunnel_setup(struct net_device *dev); -static int ipgre_tunnel_bind_dev(struct net_device *dev); - -/* Fallback tunnel: no source, no destination, no key, no options */ - -#define HASH_SIZE 16 static int ipgre_net_id __read_mostly; -struct ipgre_net { - struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; - - struct net_device *fb_tunnel_dev; -}; - -/* Tunnel hash table */ - -/* - 4 hash tables: - - 3: (remote,local) - 2: (remote,*) - 1: (*,local) - 0: (*,*) - - We require exact key match i.e. if a key is present in packet - it will match only tunnel with the same key; if it is not present, - it will match only keyless tunnel. - - All keysless packets, if not matched configured keyless tunnels - will match fallback tunnel. - */ - -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) - -#define tunnels_r_l tunnels[3] -#define tunnels_r tunnels[2] -#define tunnels_l tunnels[1] -#define tunnels_wc tunnels[0] -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -#define for_each_ip_tunnel_rcu(start) \ - for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - -/* often modified stats are per cpu, other are shared (netdev->stats) */ -struct pcpu_tstats { - u64 rx_packets; - u64 rx_bytes; - u64 tx_packets; - u64 tx_bytes; - struct u64_stats_sync syncp; -}; - -static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } - - tot->multicast = dev->stats.multicast; - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_errors = dev->stats.rx_errors; - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - return tot; -} - -/* Given src, dst and key, find appropriate for input tunnel. */ - -static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, - __be32 remote, __be32 local, - __be32 key, __be16 gre_proto) -{ - struct net *net = dev_net(dev); - int link = dev->ifindex; - unsigned int h0 = HASH(remote); - unsigned int h1 = HASH(key); - struct ip_tunnel *t, *cand = NULL; - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 
- ARPHRD_ETHER : ARPHRD_IPGRE; - int score, cand_score = 4; - - for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { - if (local != t->parms.iph.saddr || - remote != t->parms.iph.daddr || - key != t->parms.i_key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { - if (remote != t->parms.iph.daddr || - key != t->parms.i_key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { - if ((local != t->parms.iph.saddr && - (local != t->parms.iph.daddr || - !ipv4_is_multicast(local))) || - key != t->parms.i_key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { - if (t->parms.i_key != key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - if (cand != NULL) - return cand; - - dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) - return netdev_priv(dev); - - return NULL; -} - -static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, - struct ip_tunnel_parm *parms) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - __be32 key = parms->i_key; - unsigned int h = HASH(key); - int prio = 0; - - if (local) - prio |= 1; - if (remote && !ipv4_is_multicast(remote)) { - prio |= 2; - h ^= HASH(remote); - } - - return &ign->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, - struct ip_tunnel *t) -{ - return __ipgre_bucket(ign, &t->parms); -} - -static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); - - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} - -static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp; - struct ip_tunnel *iter; - - for (tp = ipgre_bucket(ign, t); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; - } - } -} - -static struct ip_tunnel *ipgre_tunnel_find(struct net *net, - struct ip_tunnel_parm *parms, - int type) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - __be32 key = parms->i_key; - int link = parms->link; - struct ip_tunnel *t; - struct ip_tunnel __rcu **tp; - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - for (tp = __ipgre_bucket(ign, parms); - (t = rtnl_dereference(*tp)) 
!= NULL; - tp = &t->next) - if (local == t->parms.iph.saddr && - remote == t->parms.iph.daddr && - key == t->parms.i_key && - link == t->parms.link && - type == t->dev->type) - break; - - return t; -} - -static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) -{ - struct ip_tunnel *t, *nt; - struct net_device *dev; - char name[IFNAMSIZ]; - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); - if (t || !create) - return t; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else - strcpy(name, "gre%d"); - - dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); - if (!dev) - return NULL; - - dev_net_set(dev, net); - - nt = netdev_priv(dev); - nt->parms = *parms; - dev->rtnl_link_ops = &ipgre_link_ops; - - dev->mtu = ipgre_tunnel_bind_dev(dev); - - if (register_netdevice(dev) < 0) - goto failed_free; - - /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) - dev->features |= NETIF_F_LLTX; - - dev_hold(dev); - ipgre_tunnel_link(ign, nt); - return nt; - -failed_free: - free_netdev(dev); - return NULL; -} - -static void ipgre_tunnel_uninit(struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - ipgre_tunnel_unlink(ign, netdev_priv(dev)); - dev_put(dev); -} - +static int gre_tap_net_id __read_mostly; -static void ipgre_err(struct sk_buff *skb, u32 info) +static int ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { -/* All the routers (except for Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. + /* All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. - Moreover, Cisco "wise men" put GRE key to the third word - in GRE header. It makes impossible maintaining even soft state for keyed - GRE tunnels with enabled checksum. Tell them "thank you". + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. It makes impossible maintaining even soft + state for keyed GRE tunnels with enabled checksum. Tell + them "thank you". - Well, I wonder, rfc1812 was written by Cisco employee, - what the hell these idiots break standards established - by themselves??? - */ - - const struct iphdr *iph = (const struct iphdr *)skb->data; - __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2)); - int grehlen = (iph->ihl<<2) + 4; + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standards established + by themselves??? + */ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; + const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; - __be16 flags; - - flags = p[0]; - if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { - if (flags&(GRE_VERSION|GRE_ROUTING)) - return; - if (flags&GRE_KEY) { - grehlen += 4; - if (flags&GRE_CSUM) - grehlen += 4; - } - } - - /* If only 8 bytes returned, keyed message will be dropped here */ - if (skb_headlen(skb) < grehlen) - return; switch (type) { default: case ICMP_PARAMETERPROB: - return; + return PACKET_RCVD; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. 
*/ - return; - case ICMP_FRAG_NEEDED: - /* Soft state for pmtu is maintained by IP core. */ - return; + return PACKET_RCVD; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -529,629 +166,173 @@ static void ipgre_err(struct sk_buff *skb, u32 info) break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return; + return PACKET_RCVD; + break; + + case ICMP_REDIRECT: break; } - rcu_read_lock(); - t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, - flags & GRE_KEY ? - *(((__be32 *)p) + (grehlen / 4) - 1) : 0, - p[1]); - if (t == NULL || t->parms.iph.daddr == 0 || + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); + + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); + + if (t == NULL) + return PACKET_REJECT; + + if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - goto out; + return PACKET_RCVD; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - goto out; + return PACKET_RCVD; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; -out: - rcu_read_unlock(); -} - -static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) -{ - if (INET_ECN_is_ce(iph->tos)) { - if (skb->protocol == htons(ETH_P_IP)) { - IP_ECN_set_ce(ip_hdr(skb)); - } else if (skb->protocol == htons(ETH_P_IPV6)) { - IP6_ECN_set_ce(ipv6_hdr(skb)); - } - } -} - -static inline u8 -ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) -{ - u8 inner = 0; - if (skb->protocol == htons(ETH_P_IP)) - inner = old_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); - return INET_ECN_encapsulate(tos, inner); + return PACKET_RCVD; } -static int ipgre_rcv(struct sk_buff *skb) +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; const struct iphdr *iph; - u8 *h; - __be16 flags; - __sum16 csum = 0; - __be32 key = 0; - u32 seqno = 0; struct ip_tunnel *tunnel; - int offset = 4; - __be16 gre_proto; - if (!pskb_may_pull(skb, 16)) - goto drop_nolock; + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); iph = ip_hdr(skb); - h = skb->data; - flags = *(__be16 *)h; - - if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { - /* - Version must be 0. - - We do not support routing headers. - */ - if (flags&(GRE_VERSION|GRE_ROUTING)) - goto drop_nolock; - - if (flags&GRE_CSUM) { - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - if (!csum) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - } - offset += 4; - } - if (flags&GRE_KEY) { - key = *(__be32 *)(h + offset); - offset += 4; - } - if (flags&GRE_SEQ) { - seqno = ntohl(*(__be32 *)(h + offset)); - offset += 4; - } - } - - gre_proto = *(__be16 *)(h + 2); - - rcu_read_lock(); - if ((tunnel = ipgre_tunnel_lookup(skb->dev, - iph->saddr, iph->daddr, key, - gre_proto))) { - struct pcpu_tstats *tstats; - - secpath_reset(skb); - - skb->protocol = gre_proto; - /* WCCP version 1 and 2 protocol decoding. 
- * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { - skb->protocol = htons(ETH_P_IP); - if ((*(h + offset) & 0xF0) != 0x40) - offset += 4; - } - - skb->mac_header = skb->network_header; - __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb_transport_header(skb), offset); - skb->pkt_type = PACKET_HOST; -#ifdef CONFIG_NET_IPGRE_BROADCAST - if (ipv4_is_multicast(iph->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; - tunnel->dev->stats.multicast++; - skb->pkt_type = PACKET_BROADCAST; - } -#endif - - if (((flags&GRE_CSUM) && csum) || - (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - if (tunnel->parms.i_flags&GRE_SEQ) { - if (!(flags&GRE_SEQ) || - (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - tunnel->i_seqno = seqno + 1; - } - - /* Warning: All skb pointers will be invalidated! */ - if (tunnel->dev->type == ARPHRD_ETHER) { - if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - - iph = ip_hdr(skb); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->saddr, iph->daddr, tpi->key); - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - __skb_tunnel_rx(skb, tunnel->dev); - - skb_reset_network_header(skb); - ipgre_ecn_decapsulate(iph, skb); - - netif_rx(skb); - - rcu_read_unlock(); - return 0; + if (tunnel) { + skb_pop_mac_header(skb); + ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + return PACKET_RCVD; } - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - -drop: - rcu_read_unlock(); -drop_nolock: - kfree_skb(skb); - return 0; + return PACKET_REJECT; } -static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, + const struct iphdr *tnl_params, + __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct pcpu_tstats *tstats; - const struct iphdr *old_iph = ip_hdr(skb); - const struct iphdr *tiph; - struct flowi4 fl4; - u8 tos; - __be16 df; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int gre_hlen; - __be32 dst; - int mtu; - - if (dev->type == ARPHRD_ETHER) - IPCB(skb)->flags = 0; - - if (dev->header_ops && dev->type == ARPHRD_IPGRE) { - gre_hlen = 0; - tiph = (const struct iphdr *)skb->data; - } else { - gre_hlen = tunnel->hlen; - tiph = &tunnel->parms.iph; - } - - if ((dst = tiph->daddr) == 0) { - /* NBMA tunnel */ - - if (skb_dst(skb) == NULL) { - dev->stats.tx_fifo_errors++; - goto tx_error; - } - - if (skb->protocol == htons(ETH_P_IP)) { - rt = skb_rtable(skb); - dst = rt->rt_gateway; - } -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { - const struct in6_addr *addr6; - struct neighbour *neigh; - bool do_tx_error_icmp; - int addr_type; - - neigh = dst_neigh_lookup(skb_dst(skb), 
&ipv6_hdr(skb)->daddr); - if (neigh == NULL) - goto tx_error; - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) - do_tx_error_icmp = true; - else { - do_tx_error_icmp = false; - dst = addr6->s6_addr32[3]; - } - neigh_release(neigh); - if (do_tx_error_icmp) - goto tx_error_icmp; - } -#endif - else - goto tx_error; - } + struct tnl_ptk_info tpi; - tos = tiph->tos; - if (tos == 1) { - tos = 0; - if (skb->protocol == htons(ETH_P_IP)) - tos = old_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); - } + tpi.flags = tunnel->parms.o_flags; + tpi.proto = proto; + tpi.key = tunnel->parms.o_key; + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; + tpi.seq = htonl(tunnel->o_seqno); - rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, - tunnel->parms.o_key, RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; - } - tdev = rt->dst.dev; + /* Push GRE header. */ + gre_build_header(skb, &tpi, tunnel->hlen); - if (tdev == dev) { - ip_rt_put(rt); - dev->stats.collisions++; - goto tx_error; - } - - df = tiph->frag_off; - if (df) - mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; - else - mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); +} - if (skb->protocol == htons(ETH_P_IP)) { - df |= (old_iph->frag_off&htons(IP_DF)); +static netdev_tx_t ipgre_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *tnl_params; - if ((old_iph->frag_off&htons(IP_DF)) && - mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - goto tx_error; - } - } -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { - struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); - - if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && - !ipv4_is_multicast(tunnel->parms.iph.daddr)) || - rt6->rt6i_dst.plen == 128) { - rt6->rt6i_flags |= RTF_MODIFIED; - dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); - } - } + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); + if (IS_ERR(skb)) + goto out; - if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - ip_rt_put(rt); - goto tx_error; - } - } -#endif + if (dev->header_ops) { + /* Need space for new headers */ + if (skb_cow_head(skb, dev->needed_headroom - + (tunnel->hlen + sizeof(struct iphdr)))) + goto free_skb; - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; + tnl_params = (const struct iphdr *)skb->data; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } + /* Pull skb since ip_tunnel_xmit() needs skb->data pointing + * to gre header. 
+ */ + skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + } else { + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; - - if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; - if (!new_skb) { - ip_rt_put(rt); - dev->stats.tx_dropped++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - if (skb->sk) - skb_set_owner_w(new_skb, skb->sk); - dev_kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); + tnl_params = &tunnel->parms.iph; } - skb_reset_transport_header(skb); - skb_push(skb, gre_hlen); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - - /* - * Push down and install the IPIP header. - */ - - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = IPPROTO_GRE; - iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; - - if ((iph->ttl = tiph->ttl) == 0) { - if (skb->protocol == htons(ETH_P_IP)) - iph->ttl = old_iph->ttl; -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) - iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; -#endif - else - iph->ttl = ip4_dst_hoplimit(&rt->dst); - } - - ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; - ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? - htons(ETH_P_TEB) : skb->protocol; - - if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); - - if (tunnel->parms.o_flags&GRE_SEQ) { - ++tunnel->o_seqno; - *ptr = htonl(tunnel->o_seqno); - ptr--; - } - if (tunnel->parms.o_flags&GRE_KEY) { - *ptr = tunnel->parms.o_key; - ptr--; - } - if (tunnel->parms.o_flags&GRE_CSUM) { - *ptr = 0; - *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr)); - } - } + __gre_xmit(skb, dev, tnl_params, skb->protocol); - nf_reset(skb); - tstats = this_cpu_ptr(dev->tstats); - __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; -#if IS_ENABLED(CONFIG_IPV6) -tx_error_icmp: - dst_link_failure(skb); -#endif -tx_error: - dev->stats.tx_errors++; - dev_kfree_skb(skb); +free_skb: + kfree_skb(skb); +out: + dev->stats.tx_dropped++; return NETDEV_TX_OK; } -static int ipgre_tunnel_bind_dev(struct net_device *dev) +static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, + struct net_device *dev) { - struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; - const struct iphdr *iph; - int hlen = LL_MAX_HEADER; - int mtu = ETH_DATA_LEN; - int addend = sizeof(struct iphdr) + 4; - - tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; - - /* Guess output device to choose reasonable mtu and needed_headroom */ - - if (iph->daddr) { - struct flowi4 fl4; - struct rtable *rt; - - rt = ip_route_output_gre(dev_net(dev), &fl4, - iph->daddr, iph->saddr, - tunnel->parms.o_key, - RT_TOS(iph->tos), - tunnel->parms.link); - if (!IS_ERR(rt)) { - tdev = rt->dst.dev; - ip_rt_put(rt); - } - - if (dev->type != ARPHRD_ETHER) - dev->flags |= IFF_POINTOPOINT; - } + struct ip_tunnel *tunnel = netdev_priv(dev); - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), 
tunnel->parms.link); + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); + if (IS_ERR(skb)) + goto out; - if (tdev) { - hlen = tdev->hard_header_len + tdev->needed_headroom; - mtu = tdev->mtu; - } - dev->iflink = tunnel->parms.link; - - /* Precalculate GRE options length */ - if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { - if (tunnel->parms.o_flags&GRE_CSUM) - addend += 4; - if (tunnel->parms.o_flags&GRE_KEY) - addend += 4; - if (tunnel->parms.o_flags&GRE_SEQ) - addend += 4; - } - dev->needed_headroom = addend + hlen; - mtu -= dev->hard_header_len + addend; + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; - if (mtu < 68) - mtu = 68; + __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - tunnel->hlen = addend; + return NETDEV_TX_OK; - return mtu; +free_skb: + kfree_skb(skb); +out: + dev->stats.tx_dropped++; + return NETDEV_TX_OK; } -static int -ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ioctl(struct net_device *dev, + struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - switch (cmd) { - case SIOCGETTUNNEL: - t = NULL; - if (dev == ign->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipgre_tunnel_locate(net, &p, 0); - } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - break; - - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - goto done; - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - if (!(p.i_flags&GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags&GRE_KEY)) - p.o_key = 0; - - t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - unsigned int nflags = 0; - - t = netdev_priv(dev); - - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; - - if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { - err = -EINVAL; - break; - } - ipgre_tunnel_unlink(ign, t); - synchronize_net(); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - t->parms.i_key = p.i_key; - t->parms.o_key = p.o_key; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - ipgre_tunnel_link(ign, t); - netdev_state_change(dev); - } - } - - if (t) { - err = 0; - if (cmd == SIOCCHGTUNNEL) { - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - t->parms.iph.frag_off = p.iph.frag_off; - if (t->parms.link != p.link) { - t->parms.link = p.link; - dev->mtu = ipgre_tunnel_bind_dev(dev); - netdev_state_change(dev); - } - } - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); - break; - - case SIOCDELTUNNEL: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - goto done; - - if (dev == ign->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) - goto done; - err = -EPERM; - if (t == netdev_priv(ign->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; - - default: - err = -EINVAL; + return -EINVAL; } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); + p.o_flags = gre_flags_to_tnl_flags(p.o_flags); -done: - return err; -} + err = ip_tunnel_ioctl(dev, &p, cmd); + if (err) + return err; -static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) - return -EINVAL; - dev->mtu = new_mtu; + p.i_flags = tnl_flags_to_gre_flags(p.i_flags); + p.o_flags = tnl_flags_to_gre_flags(p.o_flags); + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; return 0; } @@ -1181,33 +362,31 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) ... ftp fec0:6666:6666::193.233.7.65 ... - */ - static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); - struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(iph+1); + struct iphdr *iph; + struct gre_base_hdr *greh; - memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); - p[0] = t->parms.o_flags; - p[1] = htons(type); + iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); + greh = (struct gre_base_hdr *)(iph+1); + greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); + greh->protocol = htons(type); - /* - * Set the source hardware address. - */ + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + /* Set the source hardware address. 
*/ if (saddr) memcpy(&iph->saddr, saddr, 4); if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) - return t->hlen; + return t->hlen + sizeof(*iph); - return -t->hlen; + return -(t->hlen + sizeof(*iph)); } static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) @@ -1231,7 +410,7 @@ static int ipgre_open(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_gre(dev_net(dev), &fl4, + rt = ip_route_output_gre(t->net, &fl4, t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, @@ -1255,62 +434,77 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; - in_dev = inetdev_by_index(dev_net(dev), t->mlink); + in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } return 0; } - #endif static const struct net_device_ops ipgre_netdev_ops = { .ndo_init = ipgre_tunnel_init, - .ndo_uninit = ipgre_tunnel_uninit, + .ndo_uninit = ip_tunnel_uninit, #ifdef CONFIG_NET_IPGRE_BROADCAST .ndo_open = ipgre_open, .ndo_stop = ipgre_close, #endif - .ndo_start_xmit = ipgre_tunnel_xmit, + .ndo_start_xmit = ipgre_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, - .ndo_change_mtu = ipgre_tunnel_change_mtu, - .ndo_get_stats64 = ipgre_get_stats64, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; -static void ipgre_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); - free_netdev(dev); -} +#define GRE_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; - dev->destructor = ipgre_dev_free; - dev->type = ARPHRD_IPGRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; - dev->flags = IFF_NOARP; - dev->iflink = 0; - dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + ip_tunnel_setup(dev, ipgre_net_id); } -static int ipgre_tunnel_init(struct net_device *dev) +static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - struct iphdr *iph; tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; + tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->parms.iph.protocol = IPPROTO_GRE; + + dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + + dev->features |= GRE_FEATURES; + dev->hw_features |= GRE_FEATURES; + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported. 
*/ + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + +static int ipgre_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + __gre_tunnel_init(dev); - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); - memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + memcpy(dev->dev_addr, &iph->saddr, 4); + memcpy(dev->broadcast, &iph->daddr, 4); + + dev->flags = IFF_NOARP; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + dev->addr_len = 4; if (iph->daddr) { #ifdef CONFIG_NET_IPGRE_BROADCAST @@ -1324,100 +518,31 @@ static int ipgre_tunnel_init(struct net_device *dev) } else dev->header_ops = &ipgre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; -} - -static void ipgre_fb_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *iph = &tunnel->parms.iph; - - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); - - iph->version = 4; - iph->protocol = IPPROTO_GRE; - iph->ihl = 5; - tunnel->hlen = sizeof(struct iphdr) + 4; - - dev_hold(dev); + return ip_tunnel_init(dev); } - -static const struct gre_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, +static struct gre_cisco_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, + .priority = 0, }; -static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) -{ - int prio; - - for (prio = 0; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - - t = rtnl_dereference(ign->tunnels[prio][h]); - - while (t != NULL) { - unregister_netdevice_queue(t->dev, head); - t = rtnl_dereference(t->next); - } - } - } -} - static int __net_init ipgre_init_net(struct net *net) { - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - int err; - - ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", - ipgre_tunnel_setup); - if (!ign->fb_tunnel_dev) { - err = -ENOMEM; - goto err_alloc_dev; - } - dev_net_set(ign->fb_tunnel_dev, net); - - ipgre_fb_tunnel_init(ign->fb_tunnel_dev); - ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; - - if ((err = register_netdev(ign->fb_tunnel_dev))) - goto err_reg_dev; - - rcu_assign_pointer(ign->tunnels_wc[0], - netdev_priv(ign->fb_tunnel_dev)); - return 0; - -err_reg_dev: - ipgre_dev_free(ign->fb_tunnel_dev); -err_alloc_dev: - return err; + return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } static void __net_exit ipgre_exit_net(struct net *net) { - struct ipgre_net *ign; - LIST_HEAD(list); - - ign = net_generic(net, ipgre_net_id); - rtnl_lock(); - ipgre_destroy_tunnels(ign, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); + ip_tunnel_delete_net(itn, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit = ipgre_exit_net, .id = &ipgre_net_id, - .size = sizeof(struct ipgre_net), + .size = sizeof(struct ip_tunnel_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1462,8 +587,8 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) +static void 
ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], + struct ip_tunnel_parm *parms) { memset(parms, 0, sizeof(*parms)); @@ -1476,10 +601,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1503,145 +628,47 @@ static void ipgre_netlink_parms(struct nlattr *data[], parms->iph.frag_off = htons(IP_DF); } -static int ipgre_tap_init(struct net_device *dev) +static int gre_tap_init(struct net_device *dev) { - struct ip_tunnel *tunnel; + __gre_tunnel_init(dev); - tunnel = netdev_priv(dev); - - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); - - ipgre_tunnel_bind_dev(dev); - - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; + return ip_tunnel_init(dev); } -static const struct net_device_ops ipgre_tap_netdev_ops = { - .ndo_init = ipgre_tap_init, - .ndo_uninit = ipgre_tunnel_uninit, - .ndo_start_xmit = ipgre_tunnel_xmit, +static const struct net_device_ops gre_tap_netdev_ops = { + .ndo_init = gre_tap_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = gre_tap_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = ipgre_tunnel_change_mtu, - .ndo_get_stats64 = ipgre_get_stats64, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; static void ipgre_tap_setup(struct net_device *dev) { - ether_setup(dev); - - dev->netdev_ops = &ipgre_tap_netdev_ops; - dev->destructor = ipgre_dev_free; - - dev->iflink = 0; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + ip_tunnel_setup(dev, gre_tap_net_id); } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) +static int ipgre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *nt; - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - int mtu; - int err; - - nt = netdev_priv(dev); - ipgre_netlink_parms(data, &nt->parms); - - if (ipgre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; - - if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) - eth_hw_addr_random(dev); - - mtu = ipgre_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - - /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) - dev->features |= NETIF_F_LLTX; - - err = register_netdevice(dev); - if (err) - goto out; - - dev_hold(dev); - ipgre_tunnel_link(ign, nt); + struct ip_tunnel_parm p; -out: - return err; + ipgre_netlink_parms(data, tb, &p); + return ip_tunnel_newlink(dev, tb, &p); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *t, *nt; - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); struct ip_tunnel_parm p; - int mtu; - - if (dev == ign->fb_tunnel_dev) - return -EINVAL; - - nt = netdev_priv(dev); - ipgre_netlink_parms(data, &p); - - t = 
ipgre_tunnel_locate(net, &p, 0); - - if (t) { - if (t->dev != dev) - return -EEXIST; - } else { - t = nt; - - if (dev->type != ARPHRD_ETHER) { - unsigned int nflags = 0; - - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; - - if ((dev->flags ^ nflags) & - (IFF_POINTOPOINT | IFF_BROADCAST)) - return -EINVAL; - } - - ipgre_tunnel_unlink(ign, t); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - t->parms.i_key = p.i_key; - if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - } - ipgre_tunnel_link(ign, t); - netdev_state_change(dev); - } - - t->parms.o_key = p.o_key; - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - t->parms.iph.frag_off = p.iph.frag_off; - - if (t->parms.link != p.link) { - t->parms.link = p.link; - mtu = ipgre_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - netdev_state_change(dev); - } - return 0; + ipgre_netlink_parms(data, tb, &p); + return ip_tunnel_changelink(dev, tb, &p); } static size_t ipgre_get_size(const struct net_device *dev) @@ -1676,8 +703,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || @@ -1715,6 +742,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = { .validate = ipgre_tunnel_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, }; @@ -1728,13 +756,28 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .validate = ipgre_tap_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, }; -/* - * And now the modules code and kernel interface. 
- */ +static int __net_init ipgre_tap_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); +} + +static void __net_exit ipgre_tap_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); + ip_tunnel_delete_net(itn, &ipgre_tap_ops); +} + +static struct pernet_operations ipgre_tap_net_ops = { + .init = ipgre_tap_init_net, + .exit = ipgre_tap_exit_net, + .id = &gre_tap_net_id, + .size = sizeof(struct ip_tunnel_net), +}; static int __init ipgre_init(void) { @@ -1746,7 +789,11 @@ static int __init ipgre_init(void) if (err < 0) return err; - err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); + err = register_pernet_device(&ipgre_tap_net_ops); + if (err < 0) + goto pnet_tap_faied; + + err = gre_cisco_register(&ipgre_protocol); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; @@ -1760,24 +807,25 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; -out: - return err; + return 0; tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); + gre_cisco_unregister(&ipgre_protocol); add_proto_failed: + unregister_pernet_device(&ipgre_tap_net_ops); +pnet_tap_faied: unregister_pernet_device(&ipgre_net_ops); - goto out; + return err; } static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) - pr_info("%s: can't remove protocol\n", __func__); + gre_cisco_unregister(&ipgre_protocol); + unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); } @@ -1787,3 +835,4 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); MODULE_ALIAS_NETDEV("gre0"); +MODULE_ALIAS_NETDEV("gretap0"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 8590144ca33..3d4da2c16b6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -141,6 +141,7 @@ #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> +#include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> @@ -190,32 +191,21 @@ static int ip_local_deliver_finish(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); - __skb_pull(skb, ip_hdrlen(skb)); - - /* Point into the IP datagram, just past the header. 
*/ - skb_reset_transport_header(skb); + __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); { int protocol = ip_hdr(skb)->protocol; - int hash, raw; const struct net_protocol *ipprot; + int raw; resubmit: raw = raw_local_deliver(skb, protocol); - hash = protocol & (MAX_INET_PROTOS - 1); - ipprot = rcu_dereference(inet_protos[hash]); + ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot != NULL) { int ret; - if (!net_eq(net, &init_net) && !ipprot->netns_ok) { - net_info_ratelimited("%s: proto %d isn't netns-ready\n", - __func__, protocol); - kfree_skb(skb); - goto out; - } - if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb(skb); @@ -236,9 +226,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } - } else + kfree_skb(skb); + } else { IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); + consume_skb(skb); + } } } out: @@ -314,26 +306,35 @@ drop: return true; } +int sysctl_ip_early_demux __read_mostly = 1; +EXPORT_SYMBOL(sysctl_ip_early_demux); + static int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->early_demux) { + ipprot->early_demux(skb); + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + } + /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb_dst(skb) == NULL) { + if (!skb_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { - if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INADDRERRORS); - else if (err == -ENETUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INNOROUTES); - else if (err == -EXDEV) + if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), LINUX_MIB_IPRPFILTER); goto drop; @@ -410,13 +411,20 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; + BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); + BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); + BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); + IP_ADD_STATS_BH(dev_net(dev), + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - goto inhdr_error; + goto csum_error; len = ntohs(iph->tot_len); if (skb->len < len) { @@ -434,6 +442,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; } + skb->transport_header = skb->network_header + iph->ihl*4; + /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); @@ -443,6 +453,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); +csum_error: + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); inhdr_error: IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 
708b99494e2..ad382499bac 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -27,6 +27,7 @@ #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> +#include <net/ip_fib.h> /* * Write options to IP header, record destination address to @@ -92,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) unsigned char *sptr, *dptr; int soffset, doffset; int optlen; - __be32 daddr; memset(dopt, 0, sizeof(struct ip_options)); @@ -104,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) sptr = skb_network_header(skb); dptr = dopt->__data; - daddr = skb_rtable(skb)->rt_spec_dst; - if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; @@ -169,7 +167,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); - for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. @@ -179,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) doffset -= 4; } if (doffset > 3) { + __be32 daddr = fib_compute_spec_dst(skb); + memcpy(&start[doffset-1], &daddr, 4); dopt->faddr = faddr; dptr[0] = start[0]; @@ -227,7 +227,7 @@ void ip_options_fragment(struct sk_buff *skb) continue; } optlen = optptr[1]; - if (optlen<2 || optlen>l) + if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); @@ -241,6 +241,15 @@ void ip_options_fragment(struct sk_buff *skb) opt->ts_needtime = 0; } +/* helper used by ip_options_compile() to call fib_compute_spec_dst() + * at most one time. + */ +static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) +{ + if (*spec_dst == htonl(INADDR_ANY)) + *spec_dst = fib_compute_spec_dst(skb); +} + /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. 
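
A note on the spec_dst_fill() helper added in the hunk above: ip_options_compile() may need the RFC1122 specific-destination address for both the Record Route and Timestamp options in the same packet, so instead of reading the now-removed rt->rt_spec_dst field it lazily calls fib_compute_spec_dst() the first time an option asks for it and caches the result in a stack variable for any later option. The standalone sketch below shows only that memoization pattern under stated assumptions: expensive_lookup(), spec_dst_fill_demo() and the plain 0 sentinel are illustrative stand-ins invented for the example, where the kernel code uses fib_compute_spec_dst() and htonl(INADDR_ANY).

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    /* Stand-in for fib_compute_spec_dst(); pretend it is expensive. */
    static uint32_t expensive_lookup(void)
    {
            puts("lookup performed");
            return 0x0a000001;      /* arbitrary demo value */
    }

    /* Same shape as spec_dst_fill(): fill the cache only on first use.
     * 0 plays the role of htonl(INADDR_ANY) in the kernel helper. */
    static void spec_dst_fill_demo(uint32_t *spec_dst)
    {
            if (*spec_dst == 0)
                    *spec_dst = expensive_lookup();
    }

    int main(void)
    {
            uint32_t spec_dst = 0;

            spec_dst_fill_demo(&spec_dst);  /* e.g. Record Route branch */
            spec_dst_fill_demo(&spec_dst);  /* e.g. Timestamp branch: cached, no 2nd lookup */
            printf("spec_dst = 0x%08" PRIx32 "\n", spec_dst);
            return 0;
    }

Running this prints "lookup performed" once even though the helper is invoked twice, which mirrors the comment above the kernel helper ("call fib_compute_spec_dst() at most one time"): packets that carry neither option never pay for the FIB lookup at all, and packets that carry both pay for it exactly once.
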
@@ -250,12 +259,12 @@ void ip_options_fragment(struct sk_buff *skb) int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) { - int l; - unsigned char *iph; - unsigned char *optptr; - int optlen; + __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; + unsigned char *optptr; + unsigned char *iph; + int optlen, l; if (skb != NULL) { rt = skb_rtable(skb); @@ -266,27 +275,31 @@ int ip_options_compile(struct net *net, for (l = opt->optlen; l > 0; ) { switch (*optptr) { - case IPOPT_END: - for (optptr++, l--; l>0; optptr++, l--) { + case IPOPT_END: + for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; - case IPOPT_NOOP: + case IPOPT_NOOP: l--; optptr++; continue; } + if (unlikely(l < 2)) { + pp_ptr = optptr; + goto error; + } optlen = optptr[1]; - if (optlen<2 || optlen>l) { + if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { - case IPOPT_SSRR: - case IPOPT_LSRR: + case IPOPT_SSRR: + case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; @@ -312,7 +325,7 @@ int ip_options_compile(struct net *net, opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; - case IPOPT_RR: + case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; @@ -331,7 +344,8 @@ int ip_options_compile(struct net *net, goto error; } if (rt) { - memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + spec_dst_fill(&spec_dst, skb); + memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; @@ -339,7 +353,7 @@ int ip_options_compile(struct net *net, } opt->rr = optptr - iph; break; - case IPOPT_TIMESTAMP: + case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; @@ -354,38 +368,36 @@ int ip_options_compile(struct net *net, } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; - if (optptr[2]+3 > optptr[1]) { + if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { - case IPOPT_TS_TSONLY: - opt->ts = optptr - iph; + case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; - case IPOPT_TS_TSANDADDR: - if (optptr[2]+7 > optptr[1]) { + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; if (rt) { - memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + spec_dst_fill(&spec_dst, skb); + memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; optptr[2] += 8; break; - case IPOPT_TS_PRESPEC: - if (optptr[2]+7 > optptr[1]) { + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); @@ -397,8 +409,8 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - default: - if (!skb && !capable(CAP_NET_RAW)) { + default: + if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } @@ -412,20 +424,20 @@ int ip_options_compile(struct net *net, put_unaligned_be32(midtime, timeptr); opt->is_changed = 1; } - } else { + } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } - opt->ts = optptr - iph; if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } + opt->ts = optptr - iph; break; - case IPOPT_RA: + case 
IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; @@ -433,8 +445,8 @@ int ip_options_compile(struct net *net, if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; - case IPOPT_CIPSO: - if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) { + case IPOPT_CIPSO: + if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } @@ -444,10 +456,10 @@ int ip_options_compile(struct net *net, goto error; } break; - case IPOPT_SEC: - case IPOPT_SID: - default: - if (!skb && !capable(CAP_NET_RAW)) { + case IPOPT_SEC: + case IPOPT_SID: + default: + if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } @@ -564,7 +576,7 @@ void ip_forward_options(struct sk_buff *skb) optptr = raw + opt->srr; - for ( srrptr=optptr[2], srrspace = optptr[1]; + for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { @@ -620,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt->rt_type != RTN_LOCAL) return -EINVAL; - for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 451f97c42eb..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); /* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) +void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); @@ -101,30 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); - -/* dev_loopback_xmit for use with netfilter. */ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - skb_dst_force(newskb); - netif_rx_ni(newskb); - return 0; -} +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -161,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? 
opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(iph, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; + u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -200,19 +188,22 @@ static inline int ip_finish_output2(struct sk_buff *skb) } if (skb->sk) skb_set_owner_w(skb2, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = skb2; } - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); - if (neigh) { - int res = neigh_output(neigh, skb); + rcu_read_lock_bh(); + nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); + neigh = __ipv4_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + if (!IS_ERR(neigh)) { + int res = dst_neigh_output(dst, neigh, skb); - rcu_read_unlock(); + rcu_read_unlock_bh(); return res; } - rcu_read_unlock(); + rcu_read_unlock_bh(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); @@ -220,12 +211,46 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static inline int ip_skb_dst_mtu(struct sk_buff *skb) +static int ip_finish_output_gso(struct sk_buff *skb) { - struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); - return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. 
+ */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; } static int ip_finish_output(struct sk_buff *skb) @@ -237,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -281,7 +308,7 @@ int ip_mc_output(struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, - ip_dev_loopback_xmit); + dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -296,7 +323,7 @@ int ip_mc_output(struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, ip_dev_loopback_xmit); + NULL, newskb->dev, dev_loopback_xmit); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, @@ -304,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -332,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) sizeof(fl4->saddr) + sizeof(fl4->daddr)); } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ -380,7 +407,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ @@ -388,7 +415,7 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; @@ -403,9 +430,9 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + /* TODO : should we use skb->sk here instead of sk ? 
*/ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -439,10 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) - to->nf_trace = from->nf_trace; -#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif @@ -476,10 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); - if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + mtu = ip_skb_dst_mtu(skb); + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(ip_skb_dst_mtu(skb))); + htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } @@ -489,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ + mtu = mtu - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -602,6 +628,11 @@ slow_path_clean: } slow_path: + /* for offloaded checksums cleanup checksum before fragmentation */ + if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) + goto fail; + iph = ip_hdr(skb); + left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ @@ -709,7 +740,7 @@ slow_path: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } - kfree_skb(skb); + consume_skb(skb); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; @@ -783,15 +814,20 @@ static inline int ip_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - /* specify the length of each IP datagram fragment */ - skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + __skb_queue_tail(queue, skb); + } else if (skb_is_gso(skb)) { + goto append; } + skb->ip_summed = CHECKSUM_PARTIAL; + /* specify the length of each IP datagram fragment */ + skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append: return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen)); } @@ -800,6 +836,7 @@ static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork, + struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -815,7 +852,7 @@ static int __ip_append_data(struct sock *sk, int copy; int err; int offset = 0; - unsigned int maxfraglen, fragheaderlen; + unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; @@ -828,10 +865,11 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = ip_sk_ignore_df(sk) ? 
0xFFFF : mtu; - if (cork->length + length > 0xFFFF - fragheaderlen) { + if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, - mtu-exthdrlen); + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -994,47 +1032,30 @@ alloc_new_skb: } } else { int i = skb_shinfo(skb)->nr_frags; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = cork->page; - int off = cork->off; - unsigned int left; - - if (page && (left = PAGE_SIZE - off) > 0) { - if (copy >= left) - copy = left; - if (page != skb_frag_page(frag)) { - if (i == MAX_SKB_FRAGS) { - err = -EMSGSIZE; - goto error; - } - skb_fill_page_desc(skb, i, page, off, 0); - skb_frag_ref(skb, i); - frag = &skb_shinfo(skb)->frags[i]; - } - } else if (i < MAX_SKB_FRAGS) { - if (copy > PAGE_SIZE) - copy = PAGE_SIZE; - page = alloc_pages(sk->sk_allocation, 0); - if (page == NULL) { - err = -ENOMEM; - goto error; - } - cork->page = page; - cork->off = 0; - skb_fill_page_desc(skb, i, page, 0, 0); - frag = &skb_shinfo(skb)->frags[i]; - } else { - err = -EMSGSIZE; - goto error; - } - if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag), - offset, copy, skb->len, skb) < 0) { - err = -EFAULT; + err = -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) goto error; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + err = -EMSGSIZE; + if (i == MAX_SKB_FRAGS) + goto error; + + __skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, 0); + skb_shinfo(skb)->nr_frags = ++i; + get_page(pfrag->page); } - cork->off += copy; - skb_frag_size_add(frag, copy); + copy = min_t(int, copy, pfrag->size - pfrag->offset); + if (getfrag(from, + page_address(pfrag->page) + pfrag->offset, + offset, copy, skb->len, skb) < 0) + goto error_efault; + + pfrag->offset += copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -1046,6 +1067,8 @@ alloc_new_skb: return 0; +error_efault: + err = -EFAULT; error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); @@ -1055,7 +1078,6 @@ error: static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { - struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *opt; struct rtable *rt; @@ -1081,13 +1103,14 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, * We steal reference to this route, caller should not release it */ *rtp = NULL; - cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + cork->fragsize = ip_sk_use_pmtu(sk) ? 
+ dst_mtu(&rt->dst) : rt->dst.dev->mtu; cork->dst = &rt->dst; cork->length = 0; + cork->ttl = ipc->ttl; + cork->tos = ipc->tos; + cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; - cork->page = NULL; - cork->off = 0; return 0; } @@ -1124,7 +1147,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4, transhdrlen = 0; } - return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, + return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, + sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } @@ -1140,7 +1164,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int mtu; int len; int err; - unsigned int maxfraglen, fragheaderlen, fraggap; + unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; if (inet->hdrincl) return -EPERM; @@ -1164,9 +1188,11 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; - if (cork->length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); + if (cork->length + size > maxnonfragsize - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -1323,13 +1349,13 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc < IP_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc >= IP_PMTUDISC_DO || + if (inet->pmtudisc == IP_PMTUDISC_DO || + inet->pmtudisc == IP_PMTUDISC_PROBE || (skb->len <= dst_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); @@ -1337,27 +1363,29 @@ struct sk_buff *__ip_make_skb(struct sock *sk, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (rt->rt_type == RTN_MULTICAST) + if (cork->ttl != 0) + ttl = cork->ttl; + else if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else ttl = ip_select_ttl(inet, &rt->dst); - iph = (struct iphdr *)skb->data; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = inet->tos; + iph->tos = (cork->tos != -1) ? cork->tos : inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); + ip_select_ident(skb, sk); if (opt) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, cork->addr, rt, 0); } - skb->priority = sk->sk_priority; + skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1375,9 +1403,8 @@ out: return skb; } -int ip_send_skb(struct sk_buff *skb) +int ip_send_skb(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); int err; err = ip_local_out(skb); @@ -1400,7 +1427,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) return 0; /* Netfilter gets whole the not fragmented skb. 
*/ - return ip_send_skb(skb); + return ip_send_skb(sock_net(sk), skb); } /* @@ -1447,7 +1474,8 @@ struct sk_buff *ip_make_skb(struct sock *sk, if (err) return ERR_PTR(err); - err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, + err = __ip_append_data(sk, fl4, &queue, &cork, + &current->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, &cork); @@ -1472,19 +1500,34 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. + * Used to send some TCP resets/acks so far. * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. + * Use a fake percpu inet socket to avoid false sharing and contention. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - const struct ip_reply_arg *arg, unsigned int len) +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { + .sk = { + .__sk_common = { + .skc_refcnt = ATOMIC_INIT(1), + }, + .sk_wmem_alloc = ATOMIC_INIT(1), + .sk_allocation = GFP_ATOMIC, + .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + }, + .pmtudisc = IP_PMTUDISC_WANT, + .uc_ttl = -1, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len) { - struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct sk_buff *nskb; + struct sock *sk; + struct inet_sock *inet; if (ip_options_echo(&replyopts.opt.opt, skb)) return; @@ -1492,6 +1535,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, ipc.addr = daddr; ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; @@ -1500,40 +1545,43 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, 0, + flowi4_init_output(&fl4, arg->bound_dev_if, + IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), - RT_SCOPE_UNIVERSE, sk->sk_protocol, + RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), - daddr, rt->rt_spec_dst, + daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); + rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; - /* And let IP do all the hard work. + inet = &get_cpu_var(unicast_sock); - This chunk is not reenterable, hence spinlock. - Note that it uses the fact, that this function is called - with locally disabled BH and that sk cannot be already spinlocked.
- */ - bh_lock_sock(sk); inet->tos = arg->tos; + sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; + sock_net_set(sk, net); + __skb_queue_head_init(&sk->sk_write_queue); + sk->sk_sndbuf = sysctl_wmem_default; ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + nskb = skb_peek(&sk->sk_write_queue); + if (nskb) { if (arg->csumoffset >= 0) - *((__sum16 *)skb_transport_header(skb) + - arg->csumoffset) = csum_fold(csum_add(skb->csum, + *((__sum16 *)skb_transport_header(nskb) + + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); - skb->ip_summed = CHECKSUM_NONE; + nskb->ip_summed = CHECKSUM_NONE; + skb_orphan(nskb); + skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } - bh_unlock_sock(sk); + put_cpu_var(unicast_sock); ip_rt_put(rt); } @@ -1543,7 +1591,7 @@ void __init ip_init(void) ip_rt_init(); inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) - igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) + igmp_mc_init(); #endif } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 0d11f234d61..64741b93863 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -40,6 +40,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/transp_v6.h> #endif +#include <net/ip_fib.h> #include <linux/errqueue.h> #include <asm/uaccess.h> @@ -55,7 +56,6 @@ /* * SOL_IP control messages. */ -#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { @@ -186,14 +186,31 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) } EXPORT_SYMBOL(ip_cmsg_recv); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, + bool allow_ipv6) { - int err; + int err, val; struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; +#if defined(CONFIG_IPV6) + if (allow_ipv6 && + cmsg->cmsg_level == SOL_IPV6 && + cmsg->cmsg_type == IPV6_PKTINFO) { + struct in6_pktinfo *src_info; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) + return -EINVAL; + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) + return -EINVAL; + ipc->oif = src_info->ipi6_ifindex; + ipc->addr = src_info->ipi6_addr.s6_addr32[3]; + continue; + } +#endif if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -214,6 +231,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) ipc->addr = info->ipi_spec_dst.s_addr; break; } + case IP_TTL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->ttl = val; + break; + case IP_TOS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 0 || val > 255) + return -EINVAL; + ipc->tos = val; + ipc->priority = rt_tos2priority(ipc->tos); + break; + default: return -EINVAL; } @@ -367,11 +402,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf /* * Handle MSG_ERRQUEUE */ -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int 
*addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; @@ -397,13 +432,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -456,19 +491,28 @@ static int do_ip_setsockopt(struct sock *sk, int level, struct inet_sock *inet = inet_sk(sk); int val = 0, err; - if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | - (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | - (1<<IP_RETOPTS) | (1<<IP_TOS) | - (1<<IP_TTL) | (1<<IP_HDRINCL) | - (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | - (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | - (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) || - optname == IP_UNICAST_IF || - optname == IP_MULTICAST_TTL || - optname == IP_MULTICAST_ALL || - optname == IP_MULTICAST_LOOP || - optname == IP_RECVORIGDSTADDR) { + switch (optname) { + case IP_PKTINFO: + case IP_RECVTTL: + case IP_RECVOPTS: + case IP_RECVTOS: + case IP_RETOPTS: + case IP_TOS: + case IP_TTL: + case IP_HDRINCL: + case IP_MTU_DISCOVER: + case IP_RECVERR: + case IP_ROUTER_ALERT: + case IP_FREEBIND: + case IP_PASSSEC: + case IP_TRANSPARENT: + case IP_MINTTL: + case IP_NODEFRAG: + case IP_UNICAST_IF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_ALL: + case IP_MULTICAST_LOOP: + case IP_RECVORIGDSTADDR: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -580,7 +624,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TTL: if (optlen < 1) goto e_inval; - if (val != -1 && (val < 0 || val > 255)) + if (val != -1 && (val < 1 || val > 255)) goto e_inval; inet->uc_ttl = val; break; @@ -599,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->nodefrag = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; inet->pmtudisc = val; break; @@ -979,13 +1023,14 @@ mc_msf_out: case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; case IP_TRANSPARENT: - if (!!val && !capable(CAP_NET_RAW) && !capable(CAP_NET_ADMIN)) { + if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { err = -EPERM; break; } @@ -1019,18 +1064,19 @@ e_inval: * @sk: socket * @skb: buffer * - * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst - * in skb->cb[] before dst drop. - * This way, receiver doesnt make cache line misses to read rtable. + * To support IP_CMSG_PKTINFO option, we store rt_iif and specific + * destination in skb->cb[] before dst drop. + * This way, receiver doesn't make cache line misses to read rtable. 
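As an illustration (not part of this diff): the ip_cmsg_send() additions above accept per-packet IP_TTL and IP_TOS values as SOL_IP ancillary data carrying a single int. A minimal userspace sketch of a sender using them could look as follows, assuming a kernel that carries this change; the destination address, port, TTL and TOS values are placeholders.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(9) };
	char payload[] = "hello";
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	union {		/* room for two cmsgs, each carrying one int */
		char buf[CMSG_SPACE(sizeof(int)) * 2];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_name = &dst, .msg_namelen = sizeof(dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	int ttl = 5;		/* placeholder; the kernel accepts 1..255 */
	int tos = 0x10;		/* placeholder; the kernel accepts 0..255 */

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder address */
	memset(u.buf, 0, sizeof(u.buf));

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = IPPROTO_IP;			/* SOL_IP */
	cmsg->cmsg_type = IP_TTL;
	cmsg->cmsg_len = CMSG_LEN(sizeof(ttl));		/* must be CMSG_LEN(sizeof(int)) */
	memcpy(CMSG_DATA(cmsg), &ttl, sizeof(ttl));

	cmsg = CMSG_NXTHDR(&msg, cmsg);
	cmsg->cmsg_level = IPPROTO_IP;
	cmsg->cmsg_type = IP_TOS;	/* also sets skb->priority via rt_tos2priority() */
	cmsg->cmsg_len = CMSG_LEN(sizeof(tos));
	memcpy(CMSG_DATA(cmsg), &tos, sizeof(tos));

	if (sendmsg(fd, &msg, 0) < 0)
		perror("sendmsg");
	close(fd);
	return 0;
}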
*/ -void ipv4_pktinfo_prepare(struct sk_buff *skb) +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - const struct rtable *rt = skb_rtable(skb); + bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || + ipv6_sk_rxinfo(sk); - if (rt) { - pktinfo->ipi_ifindex = rt->rt_iif; - pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; + if (prepare && skb_rtable(skb)) { + pktinfo->ipi_ifindex = inet_iif(skb); + pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c new file mode 100644 index 00000000000..6f9de61dce5 --- /dev/null +++ b/net/ipv4/ip_tunnel.c @@ -0,0 +1,1062 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/rculist.h> +#include <linux/err.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) +{ + return hash_32((__force u32)key ^ (__force u32)remote, + IP_TNL_HASH_BITS); +} + +static void __tunnel_dst_set(struct ip_tunnel_dst *idst, + struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + dst_clone(dst); + old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); + dst_release(old_dst); +} + +static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ + __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); +} + +static void tunnel_dst_reset(struct ip_tunnel *t) +{ + tunnel_dst_set(t, NULL); +} + +void ip_tunnel_dst_reset_all(struct ip_tunnel *t) +{ + int i; + + for_each_possible_cpu(i) + __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); +} +EXPORT_SYMBOL(ip_tunnel_dst_reset_all); + +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +{ + struct dst_entry *dst; + + rcu_read_lock(); + dst = 
rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + if (dst) { + if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + tunnel_dst_reset(t); + dst_release(dst); + dst = NULL; + } + } + rcu_read_unlock(); + return (struct rtable *)dst; +} + +static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, + __be16 flags, __be32 key) +{ + if (p->i_flags & TUNNEL_KEY) { + if (flags & TUNNEL_KEY) + return key == p->i_key; + else + /* key expected, none present */ + return false; + } else + return !(flags & TUNNEL_KEY); +} + +/* Fallback tunnel: no source, no destination, no key, no options + + Tunnel hash table: + We require exact key match i.e. if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keysless packets, if not matched configured keyless tunnels + will match fallback tunnel. + Given src, dst and key, find appropriate for input tunnel. +*/ +struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, + int link, __be16 flags, + __be32 remote, __be32 local, + __be32 key) +{ + unsigned int hash; + struct ip_tunnel *t, *cand = NULL; + struct hlist_head *head; + + hash = ip_tunnel_hash(key, remote); + head = &itn->tunnels[hash]; + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (local != t->parms.iph.saddr || + remote != t->parms.iph.daddr || + !(t->dev->flags & IFF_UP)) + continue; + + if (!ip_tunnel_key_match(&t->parms, flags, key)) + continue; + + if (t->parms.link == link) + return t; + else + cand = t; + } + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (remote != t->parms.iph.daddr || + t->parms.iph.saddr != 0 || + !(t->dev->flags & IFF_UP)) + continue; + + if (!ip_tunnel_key_match(&t->parms, flags, key)) + continue; + + if (t->parms.link == link) + return t; + else if (!cand) + cand = t; + } + + hash = ip_tunnel_hash(key, 0); + head = &itn->tunnels[hash]; + + hlist_for_each_entry_rcu(t, head, hash_node) { + if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && + (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) + continue; + + if (!(t->dev->flags & IFF_UP)) + continue; + + if (!ip_tunnel_key_match(&t->parms, flags, key)) + continue; + + if (t->parms.link == link) + return t; + else if (!cand) + cand = t; + } + + if (flags & TUNNEL_NO_KEY) + goto skip_key_lookup; + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (t->parms.i_key != key || + t->parms.iph.saddr != 0 || + t->parms.iph.daddr != 0 || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->parms.link == link) + return t; + else if (!cand) + cand = t; + } + +skip_key_lookup: + if (cand) + return cand; + + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) + return netdev_priv(itn->fb_tunnel_dev); + + + return NULL; +} +EXPORT_SYMBOL_GPL(ip_tunnel_lookup); + +static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, + struct ip_tunnel_parm *parms) +{ + unsigned int h; + __be32 remote; + __be32 i_key = parms->i_key; + + if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) + remote = parms->iph.daddr; + else + remote = 0; + + if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + i_key = 0; + + h = ip_tunnel_hash(i_key, remote); + return &itn->tunnels[h]; +} + +static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) +{ + struct hlist_head *head = ip_bucket(itn, &t->parms); + + hlist_add_head_rcu(&t->hash_node, head); +} + +static void ip_tunnel_del(struct ip_tunnel *t) +{ 
+ hlist_del_init_rcu(&t->hash_node); +} + +static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, + struct ip_tunnel_parm *parms, + int type) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + __be32 key = parms->i_key; + __be16 flags = parms->i_flags; + int link = parms->link; + struct ip_tunnel *t = NULL; + struct hlist_head *head = ip_bucket(itn, parms); + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + link == t->parms.link && + type == t->dev->type && + ip_tunnel_key_match(&t->parms, flags, key)) + break; + } + return t; +} + +static struct net_device *__ip_tunnel_create(struct net *net, + const struct rtnl_link_ops *ops, + struct ip_tunnel_parm *parms) +{ + int err; + struct ip_tunnel *tunnel; + struct net_device *dev; + char name[IFNAMSIZ]; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + if (strlen(ops->kind) > (IFNAMSIZ - 3)) { + err = -E2BIG; + goto failed; + } + strlcpy(name, ops->kind, IFNAMSIZ); + strncat(name, "%d", 2); + } + + ASSERT_RTNL(); + dev = alloc_netdev(ops->priv_size, name, ops->setup); + if (!dev) { + err = -ENOMEM; + goto failed; + } + dev_net_set(dev, net); + + dev->rtnl_link_ops = ops; + + tunnel = netdev_priv(dev); + tunnel->parms = *parms; + tunnel->net = net; + + err = register_netdevice(dev); + if (err) + goto failed_free; + + return dev; + +failed_free: + free_netdev(dev); +failed: + return ERR_PTR(err); +} + +static inline void init_tunnel_flow(struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif) +{ + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_oif = oif; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->flowi4_tos = tos; + fl4->flowi4_proto = proto; + fl4->fl4_gre_key = key; +} + +static int ip_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = ETH_DATA_LEN; + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + iph = &tunnel->parms.iph; + + /* Guess output device to choose reasonable mtu and needed_headroom */ + if (iph->daddr) { + struct flowi4 fl4; + struct rtable *rt; + + init_tunnel_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link); + rt = ip_route_output_key(tunnel->net, &fl4); + + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + tunnel_dst_set(tunnel, &rt->dst); + ip_rt_put(rt); + } + if (dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len + tdev->needed_headroom; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + dev->needed_headroom = t_hlen + hlen; + mtu -= (dev->hard_header_len + t_hlen); + + if (mtu < 68) + mtu = 68; + + return mtu; +} + +static struct ip_tunnel *ip_tunnel_create(struct net *net, + struct ip_tunnel_net *itn, + struct ip_tunnel_parm *parms) +{ + struct ip_tunnel *nt; + struct net_device *dev; + + BUG_ON(!itn->fb_tunnel_dev); + dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); + if (IS_ERR(dev)) + return ERR_CAST(dev); + + dev->mtu = ip_tunnel_bind_dev(dev); + + nt = netdev_priv(dev); + ip_tunnel_add(itn, nt); + return nt; +} + +int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, + const struct tnl_ptk_info *tpi, bool log_ecn_error) +{ + struct 
pcpu_sw_netstats *tstats; + const struct iphdr *iph = ip_hdr(skb); + int err; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(iph->daddr)) { + tunnel->dev->stats.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || + ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + + if (tunnel->parms.i_flags&TUNNEL_SEQ) { + if (!(tpi->flags&TUNNEL_SEQ) || + (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + tunnel->i_seqno = ntohl(tpi->seq) + 1; + } + + skb_reset_network_header(skb); + + err = IP_ECN_decapsulate(iph, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &iph->saddr, iph->tos); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto drop; + } + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + + if (tunnel->dev->type == ARPHRD_ETHER) { + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } else { + skb->dev = tunnel->dev; + } + + gro_cells_receive(&tunnel->gro_cells, skb); + return 0; + +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_rcv); + +static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, + struct rtable *rt, __be16 df) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; + int mtu; + + if (df) + mtu = dst_mtu(&rt->dst) - dev->hard_header_len + - sizeof(struct iphdr) - tunnel->hlen; + else + mtu = skb_dst(skb) ? 
dst_mtu(skb_dst(skb)) : dev->mtu; + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + + if (skb->protocol == htons(ETH_P_IP)) { + if (!skb_is_gso(skb) && + (df & htons(IP_DF)) && mtu < pkt_size) { + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + return -E2BIG; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + + if (rt6 && mtu < dst_mtu(skb_dst(skb)) && + mtu >= IPV6_MIN_MTU) { + if ((tunnel->parms.iph.daddr && + !ipv4_is_multicast(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); + } + } + + if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && + mtu < pkt_size) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + return -E2BIG; + } + } +#endif + return 0; +} + +void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + const struct iphdr *tnl_params, const u8 protocol) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *inner_iph; + struct flowi4 fl4; + u8 tos, ttl; + __be16 df; + struct rtable *rt; /* Route to the other host */ + unsigned int max_headroom; /* The extra header space needed */ + __be32 dst; + int err; + bool connected; + + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + connected = (tunnel->parms.iph.daddr != 0); + + dst = tnl_params->daddr; + if (dst == 0) { + /* NBMA tunnel */ + + if (skb_dst(skb) == NULL) { + dev->stats.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == htons(ETH_P_IP)) { + rt = skb_rtable(skb); + dst = rt_nexthop(rt, inner_iph->daddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) { + const struct in6_addr *addr6; + struct neighbour *neigh; + bool do_tx_error_icmp; + int addr_type; + + neigh = dst_neigh_lookup(skb_dst(skb), + &ipv6_hdr(skb)->daddr); + if (neigh == NULL) + goto tx_error; + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + do_tx_error_icmp = true; + else { + do_tx_error_icmp = false; + dst = addr6->s6_addr32[3]; + } + neigh_release(neigh); + if (do_tx_error_icmp) + goto tx_error_icmp; + } +#endif + else + goto tx_error; + + connected = false; + } + + tos = tnl_params->tos; + if (tos & 0x1) { + tos &= ~0x1; + if (skb->protocol == htons(ETH_P_IP)) { + tos = inner_iph->tos; + connected = false; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + connected = false; + } + } + + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + + rt = connected ? 
tunnel_rtable_get(tunnel, 0) : NULL; + + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (connected) + tunnel_dst_set(tunnel, &rt->dst); + } + + if (rt->dst.dev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) { + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); + ttl = tnl_params->ttl; + if (ttl == 0) { + if (skb->protocol == htons(ETH_P_IP)) + ttl = inner_iph->ttl; +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) + ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; +#endif + else + ttl = ip4_dst_hoplimit(&rt->dst); + } + + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP)) + df |= (inner_iph->frag_off&htons(IP_DF)); + + max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + + rt->dst.header_len; + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; + + if (skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); + dev->stats.tx_dropped++; + kfree_skb(skb); + return; + } + + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, + tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + + return; + +#if IS_ENABLED(CONFIG_IPV6) +tx_error_icmp: + dst_link_failure(skb); +#endif +tx_error: + dev->stats.tx_errors++; + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(ip_tunnel_xmit); + +static void ip_tunnel_update(struct ip_tunnel_net *itn, + struct ip_tunnel *t, + struct net_device *dev, + struct ip_tunnel_parm *p, + bool set_mtu) +{ + ip_tunnel_del(t); + t->parms.iph.saddr = p->iph.saddr; + t->parms.iph.daddr = p->iph.daddr; + t->parms.i_key = p->i_key; + t->parms.o_key = p->o_key; + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p->iph.saddr, 4); + memcpy(dev->broadcast, &p->iph.daddr, 4); + } + ip_tunnel_add(itn, t); + + t->parms.iph.ttl = p->iph.ttl; + t->parms.iph.tos = p->iph.tos; + t->parms.iph.frag_off = p->iph.frag_off; + + if (t->parms.link != p->link) { + int mtu; + + t->parms.link = p->link; + mtu = ip_tunnel_bind_dev(dev); + if (set_mtu) + dev->mtu = mtu; + } + ip_tunnel_dst_reset_all(t); + netdev_state_change(dev); +} + +int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +{ + int err = 0; + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; + struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); + + BUG_ON(!itn->fb_tunnel_dev); + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == itn->fb_tunnel_dev) { + t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + if (t == NULL) + t = netdev_priv(dev); + } + memcpy(p, &t->parms, sizeof(*p)); + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + if (p->iph.ttl) + p->iph.frag_off |= htons(IP_DF); + if (!(p->i_flags & VTI_ISVTI)) { + if (!(p->i_flags & TUNNEL_KEY)) + p->i_key = 0; + if (!(p->o_flags & TUNNEL_KEY)) + p->o_key = 0; + } + + t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + + if (!t && (cmd == SIOCADDTUNNEL)) { + t = ip_tunnel_create(net, itn, p); + err = PTR_ERR_OR_ZERO(t); + break; + } 
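As an illustration (not part of this diff): the SIOCADDTUNNEL branch above is the kernel side of the classic tunnel ioctl. A minimal userspace sketch that creates an IPIP tunnel through it could look as follows; the tunnel name, local/remote addresses and the use of the "tunl0" fallback device (which assumes the ipip module is loaded) are placeholders, and the call requires CAP_NET_ADMIN.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>		/* struct ifreq, IFNAMSIZ */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/ip.h>
#include <linux/if_tunnel.h>	/* struct ip_tunnel_parm, SIOCADDTUNNEL */

int main(void)
{
	struct ip_tunnel_parm p;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&p, 0, sizeof(p));
	strncpy(p.name, "tunl1", IFNAMSIZ - 1);		/* placeholder name */
	p.iph.version = 4;
	p.iph.ihl = 5;
	p.iph.protocol = IPPROTO_IPIP;
	p.iph.ttl = 64;		/* a non-zero ttl makes the ioctl above set IP_DF */
	inet_pton(AF_INET, "192.0.2.1", &p.iph.saddr);		/* placeholder local */
	inet_pton(AF_INET, "198.51.100.2", &p.iph.daddr);	/* placeholder remote */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ - 1);	/* ipip fallback device */
	ifr.ifr_data = (void *)&p;

	if (ioctl(fd, SIOCADDTUNNEL, &ifr) < 0)		/* requires CAP_NET_ADMIN */
		perror("SIOCADDTUNNEL");
	close(fd);
	return 0;
}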
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + unsigned int nflags = 0; + + if (ipv4_is_multicast(p->iph.daddr)) + nflags = IFF_BROADCAST; + else if (p->iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { + err = -EINVAL; + break; + } + + t = netdev_priv(dev); + } + } + + if (t) { + err = 0; + ip_tunnel_update(itn, t, dev, p, true); + } else { + err = -ENOENT; + } + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + if (dev == itn->fb_tunnel_dev) { + err = -ENOENT; + t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + if (t == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(itn->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} +EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); + +static void ip_tunnel_dev_free(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + gro_cells_destroy(&tunnel->gro_cells); + free_percpu(tunnel->dst_cache); + free_percpu(dev->tstats); + free_netdev(dev); +} + +void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_net *itn; + + itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); + + if (itn->fb_tunnel_dev != dev) { + ip_tunnel_del(netdev_priv(dev)); + unregister_netdevice_queue(dev, head); + } +} +EXPORT_SYMBOL_GPL(ip_tunnel_dellink); + +int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, + struct rtnl_link_ops *ops, char *devname) +{ + struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); + struct ip_tunnel_parm parms; + unsigned int i; + + for (i = 0; i < IP_TNL_HASH_SIZE; i++) + INIT_HLIST_HEAD(&itn->tunnels[i]); + + if (!ops) { + itn->fb_tunnel_dev = NULL; + return 0; + } + + memset(&parms, 0, sizeof(parms)); + if (devname) + strlcpy(parms.name, devname, IFNAMSIZ); + + rtnl_lock(); + itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + if (!IS_ERR(itn->fb_tunnel_dev)) { + itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); + ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); + } + rtnl_unlock(); + + return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); +} +EXPORT_SYMBOL_GPL(ip_tunnel_init_net); + +static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, + struct rtnl_link_ops *ops) +{ + struct net *net = dev_net(itn->fb_tunnel_dev); + struct net_device *dev, *aux; + int h; + + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == ops) + unregister_netdevice_queue(dev, head); + + for (h = 0; h < IP_TNL_HASH_SIZE; h++) { + struct ip_tunnel *t; + struct hlist_node *n; + struct hlist_head *thead = &itn->tunnels[h]; + + hlist_for_each_entry_safe(t, n, thead, hash_node) + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. 
+ */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, head); + } +} + +void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +{ + LIST_HEAD(list); + + rtnl_lock(); + ip_tunnel_destroy(itn, &list, ops); + unregister_netdevice_many(&list); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); + +int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], + struct ip_tunnel_parm *p) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct ip_tunnel_net *itn; + int mtu; + int err; + + nt = netdev_priv(dev); + itn = net_generic(net, nt->ip_tnl_net_id); + + if (ip_tunnel_find(itn, p, dev->type)) + return -EEXIST; + + nt->net = net; + nt->parms = *p; + err = register_netdevice(dev); + if (err) + goto out; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + eth_hw_addr_random(dev); + + mtu = ip_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + ip_tunnel_add(itn, nt); + +out: + return err; +} +EXPORT_SYMBOL_GPL(ip_tunnel_newlink); + +int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], + struct ip_tunnel_parm *p) +{ + struct ip_tunnel *t; + struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; + struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + + if (dev == itn->fb_tunnel_dev) + return -EINVAL; + + t = ip_tunnel_find(itn, p, dev->type); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + t = tunnel; + + if (dev->type != ARPHRD_ETHER) { + unsigned int nflags = 0; + + if (ipv4_is_multicast(p->iph.daddr)) + nflags = IFF_BROADCAST; + else if (p->iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + } + } + + ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]); + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_changelink); + +int ip_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + int err; + + dev->destructor = ip_tunnel_dev_free; + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; + } + + err = gro_cells_init(&tunnel->gro_cells, dev); + if (err) { + free_percpu(tunnel->dst_cache); + free_percpu(dev->tstats); + return err; + } + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + iph->version = 4; + iph->ihl = 5; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_init); + +void ip_tunnel_uninit(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; + struct ip_tunnel_net *itn; + + itn = net_generic(net, tunnel->ip_tnl_net_id); + /* fb_tunnel_dev will be unregisted in net-exit call. 
*/ + if (itn->fb_tunnel_dev != dev) + ip_tunnel_del(netdev_priv(dev)); + + ip_tunnel_dst_reset_all(tunnel); +} +EXPORT_SYMBOL_GPL(ip_tunnel_uninit); + +/* Do least required initialization, rest of init is done in tunnel_init call */ +void ip_tunnel_setup(struct net_device *dev, int net_id) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + tunnel->ip_tnl_net_id = net_id; +} +EXPORT_SYMBOL_GPL(ip_tunnel_setup); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c new file mode 100644 index 00000000000..f4c987bb7e9 --- /dev/null +++ b/net/ipv4/ip_tunnel_core.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> + +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ + int pkt_len = skb->len; + struct iphdr *iph; + int err; + + skb_scrub_packet(skb, xnet); + + skb_clear_hash(skb); + skb_dst_set(skb, &rt->dst); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + /* Push down and install the IP header. 
*/ + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = dst; + iph->saddr = src; + iph->ttl = ttl; + __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); + + err = ip_local_out_sk(sk, skb); + if (unlikely(net_xmit_eval(err))) + pkt_len = 0; + return pkt_len; +} +EXPORT_SYMBOL_GPL(iptunnel_xmit); + +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +{ + if (unlikely(!pskb_may_pull(skb, hdr_len))) + return -ENOMEM; + + skb_pull_rcsum(skb, hdr_len); + + if (inner_proto == htons(ETH_P_TEB)) { + struct ethhdr *eh = (struct ethhdr *)skb->data; + + if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) + return -ENOMEM; + + if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + skb->protocol = eh->h_proto; + else + skb->protocol = htons(ETH_P_802_2); + + } else { + skb->protocol = inner_proto; + } + + nf_reset(skb); + secpath_reset(skb); + skb_clear_hash_if_not_l4(skb); + skb_dst_drop(skb); + skb->vlan_tci = 0; + skb_set_queue_mapping(skb, 0); + skb->pkt_type = PACKET_HOST; + return 0; +} +EXPORT_SYMBOL_GPL(iptunnel_pull_header); + +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, + bool csum_help, + int gso_type_mask) +{ + int err; + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + if (skb_is_gso(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + goto error; + skb_shinfo(skb)->gso_type |= gso_type_mask; + return skb; + } + + /* If packet is not gso and we are resolving any partial checksum, + * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL + * on the outer header without confusing devices that implement + * NETIF_F_IP_CSUM with encapsulation. 
+ */ + if (csum_help) + skb->encapsulation = 0; + + if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_NONE; + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); + +/* Often modified stats are per cpu, other are shared (netdev->stats) */ +struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; + } + + tot->multicast = dev->stats.multicast; + + tot->rx_crc_errors = dev->stats.rx_crc_errors; + tot->rx_fifo_errors = dev->stats.rx_fifo_errors; + tot->rx_length_errors = dev->stats.rx_length_errors; + tot->rx_frame_errors = dev->stats.rx_frame_errors; + tot->rx_errors = dev->stats.rx_errors; + + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + + tot->collisions = dev->stats.collisions; + + return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c new file mode 100644 index 00000000000..b8960f3527f --- /dev/null +++ b/net/ipv4/ip_vti.c @@ -0,0 +1,603 @@ +/* + * Linux NET3: IP/IP protocol decoder modified to support + * virtual tunnel interface + * + * Authors: + * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +/* + This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/ip_tunnels.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +static struct rtnl_link_ops vti_link_ops __read_mostly; + +static int vti_net_id __read_mostly; +static int vti_tunnel_init(struct net_device *dev); + +static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel != NULL) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + skb->mark = be32_to_cpu(tunnel->parms.i_key); + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + +static int vti_rcv(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); +} + +static int vti_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + + if (!tunnel) + return 1; + + dev = tunnel->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + +static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)&dst; + xfrm_address_t *saddr = (xfrm_address_t *)&src; + + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. 
+ */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET) + return false; + + if (!dst) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) + return false; + + return true; +} + +static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, + struct flowi *fl) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_parm *parms = &tunnel->parms; + struct dst_entry *dst = skb_dst(skb); + struct net_device *tdev; /* Device to other host */ + int err; + + if (!dst) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + + dst_hold(dst); + dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { + dev->stats.tx_carrier_errors++; + dst_release(dst); + goto tx_error_icmp; + } + + tdev = dst->dev; + + if (tdev == dev) { + dst_release(dst); + dev->stats.collisions++; + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = skb_dst(skb)->dev; + + err = dst_output(skb); + if (net_xmit_eval(err) == 0) + err = skb->len; + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return NETDEV_TX_OK; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + skb->mark = be32_to_cpu(tunnel->parms.o_key); + + switch (skb->protocol) { + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + default: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + return vti_xmit(skb, dev, &fl); +} + +static int vti4_err(struct sk_buff *skb, u32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip_tunnel *tunnel; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct iphdr *iph = (const struct iphdr *)skb->data; + int protocol = iph->protocol; + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!tunnel) + return -1; + + mark = be32_to_cpu(tunnel->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = 
xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + +static int +vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5) + return -EINVAL; + } + + if (!(p.i_flags & GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags & GRE_KEY)) + p.o_key = 0; + + p.i_flags = VTI_ISVTI; + + err = ip_tunnel_ioctl(dev, &p, cmd); + if (err) + return err; + + if (cmd != SIOCDELTUNNEL) { + p.i_flags |= GRE_KEY; + p.o_flags |= GRE_KEY; + } + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return 0; +} + +static const struct net_device_ops vti_netdev_ops = { + .ndo_init = vti_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = vti_tunnel_xmit, + .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void vti_tunnel_setup(struct net_device *dev) +{ + dev->netdev_ops = &vti_netdev_ops; + dev->type = ARPHRD_TUNNEL; + ip_tunnel_setup(dev, vti_net_id); +} + +static int vti_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + + memcpy(dev->dev_addr, &iph->saddr, 4); + memcpy(dev->broadcast, &iph->daddr, 4); + + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + dev->features |= NETIF_F_LLTX; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + + return ip_tunnel_init(dev); +} + +static void __net_init vti_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; +} + +static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static int __net_init vti_init_net(struct net *net) +{ + int err; + struct ip_tunnel_net *itn; + + err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); + if (err) + return err; + itn = net_generic(net, vti_net_id); + vti_fb_tunnel_init(itn->fb_tunnel_dev); + return 0; +} + +static void __net_exit vti_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + ip_tunnel_delete_net(itn, &vti_link_ops); +} + +static struct pernet_operations vti_net_ops = { + .init = vti_init_net, + .exit = vti_exit_net, + .id = &vti_net_id, + .size = sizeof(struct ip_tunnel_net), +}; + +static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + return 0; +} + +static 
void vti_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.protocol = IPPROTO_IPIP; + + if (!data) + return; + + parms->i_flags = VTI_ISVTI; + + if (data[IFLA_VTI_LINK]) + parms->link = nla_get_u32(data[IFLA_VTI_LINK]); + + if (data[IFLA_VTI_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); + + if (data[IFLA_VTI_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); + + if (data[IFLA_VTI_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); + + if (data[IFLA_VTI_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); + +} + +static int vti_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ip_tunnel_parm parms; + + vti_netlink_parms(data, &parms); + return ip_tunnel_newlink(dev, tb, &parms); +} + +static int vti_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel_parm p; + + vti_netlink_parms(data, &p); + return ip_tunnel_changelink(dev, tb, &p); +} + +static size_t vti_get_size(const struct net_device *dev) +{ + return + /* IFLA_VTI_LINK */ + nla_total_size(4) + + /* IFLA_VTI_IKEY */ + nla_total_size(4) + + /* IFLA_VTI_OKEY */ + nla_total_size(4) + + /* IFLA_VTI_LOCAL */ + nla_total_size(4) + + /* IFLA_VTI_REMOTE */ + nla_total_size(4) + + 0; +} + +static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm *p = &t->parms; + + nla_put_u32(skb, IFLA_VTI_LINK, p->link); + nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); + nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); + nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); + nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); + + return 0; +} + +static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { + [IFLA_VTI_LINK] = { .type = NLA_U32 }, + [IFLA_VTI_IKEY] = { .type = NLA_U32 }, + [IFLA_VTI_OKEY] = { .type = NLA_U32 }, + [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, +}; + +static struct rtnl_link_ops vti_link_ops __read_mostly = { + .kind = "vti", + .maxtype = IFLA_VTI_MAX, + .policy = vti_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = vti_tunnel_setup, + .validate = vti_tunnel_validate, + .newlink = vti_newlink, + .changelink = vti_changelink, + .get_size = vti_get_size, + .fill_info = vti_fill_info, +}; + +static int __init vti_init(void) +{ + int err; + + pr_info("IPv4 over IPSec tunneling driver\n"); + + err = register_pernet_device(&vti_net_ops); + if (err < 0) + return err; + err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); + if (err < 0) { + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); + if (err < 0) { + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = rtnl_link_register(&vti_link_ops); + if (err < 0) + goto rtnl_link_failed; + + return err; + 
+rtnl_link_failed: + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + return err; +} + +static void __exit vti_fini(void) +{ + rtnl_link_unregister(&vti_link_ops); + if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) + pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) + pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) + pr_info("vti close: can't deregister tunnel\n"); + + + unregister_pernet_device(&vti_net_ops); +} + +module_init(vti_init); +module_exit(vti_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("vti"); +MODULE_ALIAS_NETDEV("ip_vti0"); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 63b64c45a82..c0855d50a3f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -23,7 +23,7 @@ #include <net/protocol.h> #include <net/sock.h> -static void ipcomp4_err(struct sk_buff *skb, u32 info) +static int ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; @@ -31,18 +31,29 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) - return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", - spi, &iph->daddr); + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); + + return 0; } /* We always hold one tunnel user reference to indicate a tunnel */ @@ -63,6 +74,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; + t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); if (xfrm_init_state(t)) @@ -137,6 +149,11 @@ out: return err; } +static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ipcomp_type = { .description = "IPCOMP4", .owner = THIS_MODULE, @@ -147,10 +164,12 @@ static const struct xfrm_type ipcomp_type = { .output = ipcomp_output }; -static const struct net_protocol ipcomp4_protocol = { +static struct xfrm4_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ipcomp4_rcv_cb, .err_handler = ipcomp4_err, - .no_policy = 1, + .priority = 0, }; static int __init ipcomp4_init(void) @@ -159,7 +178,7 @@ static int __init ipcomp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); return -EAGAIN; @@ -169,7 +188,7 @@ static int __init ipcomp4_init(void) 
static void __exit ipcomp4_fini(void) { - if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 67e8a6b086e..b3e86ea7b71 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -136,6 +136,8 @@ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ __be32 ic_gateway = NONE; /* Gateway IP address */ +__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */ + __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ @@ -204,7 +206,7 @@ static int __init ic_open_devs(void) struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; - unsigned long start; + unsigned long start, next_msg; last = &ic_first_dev; rtnl_lock(); @@ -261,12 +263,23 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; + next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + int wait, elapsed; + for_each_netdev(&init_net, dev) if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) goto have_carrier; msleep(1); + + if (time_before(jiffies, next_msg)) + continue; + + elapsed = jiffies_to_msecs(jiffies - start); + wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000; + pr_info("Waiting up to %d more seconds for network.\n", wait); + next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); } have_carrier: rtnl_unlock(); @@ -558,6 +571,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ic_myaddr == NONE) ic_myaddr = tip; ic_servaddr = sip; + ic_addrservaddr = sip; ic_got_reply = IC_RARP; drop_unlock: @@ -583,6 +597,17 @@ static void __init ic_rarp_send_if(struct ic_device *d) #endif /* + * Predefine Nameservers + */ +static inline void __init ic_nameservers_predef(void) +{ + int i; + + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) + ic_nameservers[i] = NONE; +} + +/* * DHCP/BOOTP support. */ @@ -747,10 +772,7 @@ static void __init ic_bootp_init_ext(u8 *e) */ static inline void __init ic_bootp_init(void) { - int i; - - for (i = 0; i < CONF_NAMESERVERS_MAX; i++) - ic_nameservers[i] = NONE; + ic_nameservers_predef(); dev_add_pack(&bootp_packet_type); } @@ -1060,7 +1082,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_servaddr = server_id; #ifdef IPCONFIG_DEBUG printk("DHCP: Offered address %pI4 by server %pI4\n", - &ic_myaddr, &ic_servaddr); + &ic_myaddr, &b->iph.saddr); #endif /* The DHCP indicated server address takes * precedence over the bootp header one if @@ -1105,6 +1127,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_dev = dev; ic_myaddr = b->your_ip; ic_servaddr = b->server_ip; + ic_addrservaddr = b->iph.saddr; if (ic_gateway == NONE && b->relay_ip) ic_gateway = b->relay_ip; if (ic_nameservers[0] == NONE) @@ -1260,7 +1283,7 @@ static int __init ic_dynamic(void) printk("IP-Config: Got %s answer from %pI4, ", ((ic_got_reply & IC_RARP) ? "RARP" : (ic_proto_enabled & IC_USE_DHCP) ? 
"DHCP" : "BOOTP"), - &ic_servaddr); + &ic_addrservaddr); pr_cont("my address is %pI4\n", &ic_myaddr); return 0; @@ -1379,9 +1402,10 @@ static int __init ip_auto_config(void) int retries = CONF_OPEN_RETRIES; #endif int err; + unsigned int i; #ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); + proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) @@ -1491,14 +1515,25 @@ static int __init ip_auto_config(void) * Clue in the operator. */ pr_info("IP-Config: Complete:\n"); - pr_info(" device=%s, addr=%pI4, mask=%pI4, gw=%pI4\n", - ic_dev->name, &ic_myaddr, &ic_netmask, &ic_gateway); + + pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", + ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, + &ic_myaddr, &ic_netmask, &ic_gateway); pr_info(" host=%s, domain=%s, nis-domain=%s\n", utsname()->nodename, ic_domain, utsname()->domainname); pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s", &ic_servaddr, &root_server_addr, root_server_path); if (ic_dev_mtu) pr_cont(", mtu=%d", ic_dev_mtu); + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) + if (ic_nameservers[i] != NONE) { + pr_info(" nameserver%u=%pI4", + i, &ic_nameservers[i]); + break; + } + for (i++; i < CONF_NAMESERVERS_MAX; i++) + if (ic_nameservers[i] != NONE) + pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); pr_cont("\n"); #endif /* !SILENT */ @@ -1570,6 +1605,8 @@ static int __init ip_auto_config_setup(char *addrs) return 1; } + ic_nameservers_predef(); + /* Parse string for static IP assignment. */ ip = addrs; while (ip && *ip) { @@ -1613,6 +1650,20 @@ static int __init ip_auto_config_setup(char *addrs) ic_enable = 0; } break; + case 7: + if (CONF_NAMESERVERS_MAX >= 1) { + ic_nameservers[0] = in_aton(ip); + if (ic_nameservers[0] == ANY) + ic_nameservers[0] = NONE; + } + break; + case 8: + if (CONF_NAMESERVERS_MAX >= 2) { + ic_nameservers[1] = in_aton(ip); + if (ic_nameservers[1] == ANY) + ic_nameservers[1] = NONE; + } + break; } } ip = cp; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2d0f99bf61b..62eaa005e14 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -111,218 +111,20 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> -#define HASH_SIZE 16 -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int ipip_net_id __read_mostly; -struct ipip_net { - struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_wc[1]; - struct ip_tunnel __rcu **tunnels[4]; - - struct net_device *fb_tunnel_dev; -}; static int ipip_tunnel_init(struct net_device *dev); -static void ipip_tunnel_setup(struct net_device *dev); -static void ipip_dev_free(struct net_device *dev); - -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -#define for_each_ip_tunnel_rcu(start) \ - for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - -/* often modified stats are per cpu, other are shared (netdev->stats) */ -struct pcpu_tstats { - u64 rx_packets; - u64 rx_bytes; - u64 tx_packets; - u64 tx_bytes; - struct u64_stats_sync syncp; -}; - -static struct 
rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - tot->collisions = dev->stats.collisions; - - return tot; -} - -static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, - __be32 remote, __be32 local) -{ - unsigned int h0 = HASH(remote); - unsigned int h1 = HASH(local); - struct ip_tunnel *t; - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) - if (local == t->parms.iph.saddr && - remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) - return t; - - for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) - if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) - return t; - - for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) - if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) - return t; - - t = rcu_dereference(ipn->tunnels_wc[0]); - if (t && (t->dev->flags&IFF_UP)) - return t; - return NULL; -} - -static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, - struct ip_tunnel_parm *parms) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - unsigned int h = 0; - int prio = 0; - - if (remote) { - prio |= 2; - h ^= HASH(remote); - } - if (local) { - prio |= 1; - h ^= HASH(local); - } - return &ipn->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, - struct ip_tunnel *t) -{ - return __ipip_bucket(ipn, &t->parms); -} - -static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp; - struct ip_tunnel *iter; - - for (tp = ipip_bucket(ipn, t); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; - } - } -} - -static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} - -static struct ip_tunnel *ipip_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - struct ip_tunnel *t, *nt; - struct ip_tunnel __rcu **tp; - struct net_device *dev; - char name[IFNAMSIZ]; - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - for (tp = __ipip_bucket(ipn, parms); - (t = rtnl_dereference(*tp)) != NULL; - tp = &t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) - return t; - } - if (!create) - return NULL; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else - strcpy(name, "tunl%d"); - - dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); - if (dev == NULL) - return NULL; - - dev_net_set(dev, net); - - nt = 
netdev_priv(dev); - nt->parms = *parms; - - if (ipip_tunnel_init(dev) < 0) - goto failed_free; - - if (register_netdevice(dev) < 0) - goto failed_free; - - strcpy(nt->parms.name, dev->name); - - dev_hold(dev); - ipip_tunnel_link(ipn, nt); - return nt; - -failed_free: - ipip_dev_free(dev); - return NULL; -} - -/* called with RTNL */ -static void ipip_tunnel_uninit(struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - if (dev == ipn->fb_tunnel_dev) - RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); - else - ipip_tunnel_unlink(ipn, netdev_priv(dev)); - dev_put(dev); -} +static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { @@ -331,45 +133,35 @@ static int ipip_err(struct sk_buff *skb, u32 info) 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + + err = -ENOENT; + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (t == NULL) + goto out; - switch (type) { - default: - case ICMP_PARAMETERPROB: - return 0; - - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: - /* Impossible event. */ - return 0; - case ICMP_FRAG_NEEDED: - /* Soft state for pmtu is maintained by IP core. */ - return 0; - default: - /* All others are translated to HOST_UNREACH. - rfc2003 contains "deep thoughts" about NET_UNREACH, - I believe they are just ether pollution. --ANK - */ - break; - } - break; - case ICMP_TIME_EXCEEDED: - if (code != ICMP_EXC_TTL) - return 0; - break; + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->parms.link, 0, IPPROTO_IPIP, 0); + err = 0; + goto out; } - err = -ENOENT; + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, + IPPROTO_IPIP, 0); + err = 0; + goto out; + } - rcu_read_lock(); - t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); - if (t == NULL || t->parms.iph.daddr == 0) + if (t->parms.iph.daddr == 0) goto out; err = 0; @@ -381,541 +173,310 @@ static int ipip_err(struct sk_buff *skb, u32 info) else t->err_count = 1; t->err_time = jiffies; + out: - rcu_read_unlock(); return err; } -static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, - struct sk_buff *skb) -{ - struct iphdr *inner_iph = ip_hdr(skb); - - if (INET_ECN_is_ce(outer_iph->tos)) - IP_ECN_set_ce(inner_iph); -} +static const struct tnl_ptk_info tpi = { + /* no tunnel info required for ipip. 
*/ + .proto = htons(ETH_P_IP), +}; static int ipip_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); struct ip_tunnel *tunnel; - const struct iphdr *iph = ip_hdr(skb); - - rcu_read_lock(); - tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); - if (tunnel != NULL) { - struct pcpu_tstats *tstats; - - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - rcu_read_unlock(); - kfree_skb(skb); - return 0; - } - - secpath_reset(skb); - - skb->mac_header = skb->network_header; - skb_reset_network_header(skb); - skb->protocol = htons(ETH_P_IP); - skb->pkt_type = PACKET_HOST; - - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - __skb_tunnel_rx(skb, tunnel->dev); - - ipip_ecn_decapsulate(iph, skb); - - netif_rx(skb); + const struct iphdr *iph; - rcu_read_unlock(); - return 0; + iph = ip_hdr(skb); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); } - rcu_read_unlock(); return -1; + +drop: + kfree_skb(skb); + return 0; } /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ - static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct pcpu_tstats *tstats; const struct iphdr *tiph = &tunnel->parms.iph; - u8 tos = tunnel->parms.iph.tos; - __be16 df = tiph->frag_off; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - const struct iphdr *old_iph = ip_hdr(skb); - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - __be32 dst = tiph->daddr; - struct flowi4 fl4; - int mtu; - - if (skb->protocol != htons(ETH_P_IP)) - goto tx_error; - - if (tos & 1) - tos = old_iph->tos; - if (!dst) { - /* NBMA tunnel */ - if ((rt = skb_rtable(skb)) == NULL) { - dev->stats.tx_fifo_errors++; - goto tx_error; - } - dst = rt->rt_gateway; - } - - rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, - dst, tiph->saddr, - 0, 0, - IPPROTO_IPIP, RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; - } - tdev = rt->dst.dev; - - if (tdev == dev) { - ip_rt_put(rt); - dev->stats.collisions++; + if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - } - - df |= old_iph->frag_off & htons(IP_DF); - - if (df) { - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - - if (mtu < 68) { - dev->stats.collisions++; - ip_rt_put(rt); - goto tx_error; - } - - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - if ((old_iph->frag_off & htons(IP_DF)) && - mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - ip_rt_put(rt); - goto tx_error; - } - } - - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. 
- */ - max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); - - if (skb_headroom(skb) < max_headroom || skb_shared(skb) || - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - dev->stats.tx_dropped++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - if (skb->sk) - skb_set_owner_w(new_skb, skb->sk); - dev_kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto out; - skb->transport_header = skb->network_header; - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - - /* - * Push down and install the IPIP header. - */ - - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; - - if ((iph->ttl = tiph->ttl) == 0) - iph->ttl = old_iph->ttl; - - nf_reset(skb); - tstats = this_cpu_ptr(dev->tstats); - __IPTUNNEL_XMIT(tstats, &dev->stats); + ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; -tx_error_icmp: - dst_link_failure(skb); tx_error: + kfree_skb(skb); +out: dev->stats.tx_errors++; - dev_kfree_skb(skb); return NETDEV_TX_OK; } -static void ipip_tunnel_bind_dev(struct net_device *dev) -{ - struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; - const struct iphdr *iph; - - tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; - - if (iph->daddr) { - struct rtable *rt; - struct flowi4 fl4; - - rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, - iph->daddr, iph->saddr, - 0, 0, - IPPROTO_IPIP, - RT_TOS(iph->tos), - tunnel->parms.link); - if (!IS_ERR(rt)) { - tdev = rt->dst.dev; - ip_rt_put(rt); - } - dev->flags |= IFF_POINTOPOINT; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); - - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); - } - dev->iflink = tunnel->parms.link; -} - static int -ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - switch (cmd) { - case SIOCGETTUNNEL: - t = NULL; - if (dev == ipn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipip_tunnel_locate(net, &p, 0); - } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - break; - - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) - goto done; - if 
(p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - ipip_tunnel_unlink(ipn, t); - synchronize_net(); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - ipip_tunnel_link(ipn, t); - netdev_state_change(dev); - } - } - - if (t) { - err = 0; - if (cmd == SIOCCHGTUNNEL) { - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - t->parms.iph.frag_off = p.iph.frag_off; - if (t->parms.link != p.link) { - t->parms.link = p.link; - ipip_tunnel_bind_dev(dev); - netdev_state_change(dev); - } - } - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; - - case SIOCDELTUNNEL: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - goto done; - - if (dev == ipn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) - goto done; - err = -EPERM; - if (t->dev == ipn->fb_tunnel_dev) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; - - default: - err = -EINVAL; + return -EINVAL; } -done: - return err; -} + p.i_key = p.o_key = p.i_flags = p.o_flags = 0; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + err = ip_tunnel_ioctl(dev, &p, cmd); + if (err) + return err; + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; -static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) - return -EINVAL; - dev->mtu = new_mtu; return 0; } static const struct net_device_ops ipip_netdev_ops = { - .ndo_uninit = ipip_tunnel_uninit, + .ndo_init = ipip_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, - .ndo_change_mtu = ipip_tunnel_change_mtu, - .ndo_get_stats64 = ipip_get_stats64, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; -static void ipip_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); - free_netdev(dev); -} +#define IPIP_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; - dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + + dev->features |= IPIP_FEATURES; + dev->hw_features |= IPIP_FEATURES; + ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - tunnel->dev = dev; - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - ipip_tunnel_bind_dev(dev); - - dev->tstats = alloc_percpu(struct 
pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; + tunnel->hlen = 0; + tunnel->parms.iph.protocol = IPPROTO_IPIP; + return ip_tunnel_init(dev); } -static int __net_init ipip_fb_tunnel_init(struct net_device *dev) +static void ipip_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) { - struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *iph = &tunnel->parms.iph; - struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); + memset(parms, 0, sizeof(*parms)); - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + parms->iph.version = 4; + parms->iph.protocol = IPPROTO_IPIP; + parms->iph.ihl = 5; - iph->version = 4; - iph->protocol = IPPROTO_IPIP; - iph->ihl = 5; + if (!data) + return; - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; + if (data[IFLA_IPTUN_LINK]) + parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); - dev_hold(dev); - rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); - return 0; -} + if (data[IFLA_IPTUN_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); -static struct xfrm_tunnel ipip_handler __read_mostly = { - .handler = ipip_rcv, - .err_handler = ipip_err, - .priority = 1, -}; + if (data[IFLA_IPTUN_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); -static const char banner[] __initconst = - KERN_INFO "IPv4 over IPv4 tunneling driver\n"; + if (data[IFLA_IPTUN_TTL]) { + parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); + if (parms->iph.ttl) + parms->iph.frag_off = htons(IP_DF); + } + + if (data[IFLA_IPTUN_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + + if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); +} -static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) +static int ipip_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) { - int prio; - - for (prio = 1; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - - t = rtnl_dereference(ipn->tunnels[prio][h]); - while (t != NULL) { - unregister_netdevice_queue(t->dev, head); - t = rtnl_dereference(t->next); - } - } - } + struct ip_tunnel_parm p; + + ipip_netlink_parms(data, &p); + return ip_tunnel_newlink(dev, tb, &p); } -static int __net_init ipip_init_net(struct net *net) +static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) { - struct ipip_net *ipn = net_generic(net, ipip_net_id); - struct ip_tunnel *t; - int err; + struct ip_tunnel_parm p; - ipn->tunnels[0] = ipn->tunnels_wc; - ipn->tunnels[1] = ipn->tunnels_l; - ipn->tunnels[2] = ipn->tunnels_r; - ipn->tunnels[3] = ipn->tunnels_r_l; - - ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), - "tunl0", - ipip_tunnel_setup); - if (!ipn->fb_tunnel_dev) { - err = -ENOMEM; - goto err_alloc_dev; - } - dev_net_set(ipn->fb_tunnel_dev, net); + ipip_netlink_parms(data, &p); - err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); - if (err) - goto err_reg_dev; + if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + return -EINVAL; - if ((err = register_netdev(ipn->fb_tunnel_dev))) - goto err_reg_dev; + return ip_tunnel_changelink(dev, tb, &p); +} - t = netdev_priv(ipn->fb_tunnel_dev); +static size_t ipip_get_size(const struct net_device *dev) +{ + return + /* IFLA_IPTUN_LINK */ + nla_total_size(4) + + /* IFLA_IPTUN_LOCAL */ + nla_total_size(4) + + /* IFLA_IPTUN_REMOTE */ + 
nla_total_size(4) + + /* IFLA_IPTUN_TTL */ + nla_total_size(1) + + /* IFLA_IPTUN_TOS */ + nla_total_size(1) + + /* IFLA_IPTUN_PMTUDISC */ + nla_total_size(1) + + 0; +} - strcpy(t->parms.name, ipn->fb_tunnel_dev->name); +static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || + nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || + nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || + nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, + !!(parm->iph.frag_off & htons(IP_DF)))) + goto nla_put_failure; return 0; -err_reg_dev: - ipip_dev_free(ipn->fb_tunnel_dev); -err_alloc_dev: - /* nothing */ - return err; +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { + [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, + [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, + [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, + [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, + [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, + [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ipip_link_ops __read_mostly = { + .kind = "ipip", + .maxtype = IFLA_IPTUN_MAX, + .policy = ipip_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipip_tunnel_setup, + .newlink = ipip_newlink, + .changelink = ipip_changelink, + .dellink = ip_tunnel_dellink, + .get_size = ipip_get_size, + .fill_info = ipip_fill_info, +}; + +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = ipip_rcv, + .err_handler = ipip_err, + .priority = 1, +}; + +static int __net_init ipip_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_net(struct net *net) { - struct ipip_net *ipn = net_generic(net, ipip_net_id); - LIST_HEAD(list); - - rtnl_lock(); - ipip_destroy_tunnels(ipn, &list); - unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + ip_tunnel_delete_net(itn, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit = ipip_exit_net, .id = &ipip_net_id, - .size = sizeof(struct ipip_net), + .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; - printk(banner); + pr_info("ipip: IPv4 over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { - unregister_pernet_device(&ipip_net_ops); pr_info("%s: can't register tunnel\n", __func__); + goto xfrm_tunnel_failed; } + err = rtnl_link_register(&ipip_link_ops); + if (err < 0) + goto rtnl_link_failed; + +out: return err; + +rtnl_link_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel_failed: + unregister_pernet_device(&ipip_net_ops); + goto out; } static void __exit ipip_fini(void) { + rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); @@ -925,4 +486,5 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 
c94bbc6f2ba..65bcaa78904 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -61,10 +61,11 @@ #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> +#include <linux/netconf.h> #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) #define CONFIG_IP_PIMSM 1 @@ -83,8 +84,8 @@ struct mr_table { struct vif_device vif_table[MAXVIFS]; int maxvif; atomic_t cache_resolve_queue_len; - int mroute_do_assert; - int mroute_do_pim; + bool mroute_do_assert; + bool mroute_do_pim; #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) int mroute_reg_vif_num; #endif @@ -124,13 +125,18 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ipmr_new_table(struct net *net, u32 id); -static int ip_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *cache, - int local); +static void ipmr_free_table(struct mr_table *mrt); + +static void ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); +static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, + int cmd); +static void mroute_clean_tables(struct mr_table *mrt); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -151,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - struct ipmr_result res; - struct fib_lookup_arg arg = { .result = &res, }; int err; + struct ipmr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); @@ -218,7 +227,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, return 0; } -static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { +static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), @@ -271,7 +280,7 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - kfree(mrt); + ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } @@ -299,7 +308,7 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { - kfree(net->ipv4.mrt); + ipmr_free_table(net->ipv4.mrt); } #endif @@ -336,6 +345,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) return mrt; } +static void ipmr_free_table(struct mr_table *mrt) +{ + del_timer_sync(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) @@ -412,6 +428,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) goto failure; ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; if 
(dev_open(dev)) @@ -438,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) struct mr_table *mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_iif = skb->skb_iif, + .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi4_mark = skb->mark, }; int err; @@ -467,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev) dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; - dev->netdev_ops = ®_vif_netdev_ops, + dev->netdev_ops = ®_vif_netdev_ops; dev->destructor = free_netdev; dev->features |= NETIF_F_NETNS_LOCAL; } @@ -504,6 +521,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; rcu_read_unlock(); @@ -524,8 +542,8 @@ failure: } #endif -/* - * Delete a VIF entry +/** + * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call */ @@ -572,6 +590,9 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; + inet_netconf_notify_devconf(dev_net(dev), + NETCONFA_MC_FORWARDING, + dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); } @@ -610,13 +631,13 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - e = NLMSG_DATA(nlh); + e = nlmsg_data(nlh); e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); - rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { kfree_skb(skb); } @@ -655,6 +676,7 @@ static void ipmr_expire_process(unsigned long arg) } list_del(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_destroy_unres(mrt, c); } @@ -762,6 +784,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; + inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex, + &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ @@ -809,6 +833,49 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, return NULL; } +/* Look for a (*,*,oif) entry */ +static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, + int vifi) +{ + int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); + struct mfc_cache *c; + + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY) && + c->mfc_un.res.ttls[vifi] < 255) + return c; + + return NULL; +} + +/* Look for a (*,G) entry */ +static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, + __be32 mcastgrp, int vifi) +{ + int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); + struct mfc_cache *c, *proxy; + + if (mcastgrp == htonl(INADDR_ANY)) + goto skip; + + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == mcastgrp) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok if the vifi is part of the static tree */ + proxy = ipmr_cache_find_any_parent(mrt, + c->mfc_parent); + if (proxy && 
proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } + +skip: + return ipmr_cache_find_any_parent(mrt, vifi); +} + /* * Allocate a multicast cache entry */ @@ -848,19 +915,19 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); - if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - e = NLMSG_DATA(nlh); + e = nlmsg_data(nlh); e->error = -EMSGSIZE; memset(&e->msg, 0, sizeof(e->msg)); } - rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { ip_mr_forward(net, mrt, skb, c, 0); } @@ -918,7 +985,7 @@ static int ipmr_cache_report(struct mr_table *mrt, /* Copy the IP header */ - skb->network_header = skb->tail; + skb_set_network_header(skb, skb->len); skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ @@ -1010,6 +1077,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->list, &mrt->mfc_unres_queue); + mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); @@ -1033,7 +1101,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) * MFC cache manipulation by user space mroute daemon */ -static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) +static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { int line; struct mfc_cache *c, *next; @@ -1042,9 +1110,10 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && + (parent == -1 || parent == c->mfc_parent)) { list_del_rcu(&c->list); - + mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_free(c); return 0; } @@ -1053,7 +1122,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, - struct mfcctl *mfc, int mrtsock) + struct mfcctl *mfc, int mrtsock, int parent) { bool found = false; int line; @@ -1066,7 +1135,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && + (parent == -1 || parent == c->mfc_parent)) { found = true; break; } @@ -1079,10 +1149,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } - if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) + if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) && + !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c = ipmr_cache_alloc(); @@ -1121,6 +1193,7 @@ static int ipmr_mfc_add(struct net *net, struct 
mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1149,6 +1222,7 @@ static void mroute_clean_tables(struct mr_table *mrt) if (c->mfc_flags & MFC_STATIC) continue; list_del_rcu(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_free(c); } } @@ -1157,6 +1231,7 @@ static void mroute_clean_tables(struct mr_table *mrt) spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { list_del(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_destroy_unres(mrt, c); } spin_unlock_bh(&mfc_unres_lock); @@ -1175,6 +1250,9 @@ static void mrtsock_destruct(struct sock *sk) ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; + inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); mroute_clean_tables(mrt); } @@ -1191,29 +1269,30 @@ static void mrtsock_destruct(struct sock *sk) int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) { - int ret; + int ret, parent = 0; struct vifctl vif; struct mfcctl mfc; struct net *net = sock_net(sk); struct mr_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_IGMP) + return -EOPNOTSUPP; + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (mrt == NULL) return -ENOENT; if (optname != MRT_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && - !capable(CAP_NET_ADMIN)) + !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT_INIT: - if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->inet_num != IPPROTO_IGMP) - return -EOPNOTSUPP; if (optlen != sizeof(int)) - return -ENOPROTOOPT; + return -EINVAL; rtnl_lock(); if (rtnl_dereference(mrt->mroute_sk)) { @@ -1225,6 +1304,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; + inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all); } rtnl_unlock(); return ret; @@ -1256,16 +1338,22 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi */ case MRT_ADD_MFC: case MRT_DEL_MFC: + parent = -1; + case MRT_ADD_MFC_PROXY: + case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) return -EINVAL; if (copy_from_user(&mfc, optval, sizeof(mfc))) return -EFAULT; + if (parent == 0) + parent = mfc.mfcc_parent; rtnl_lock(); - if (optname == MRT_DEL_MFC) - ret = ipmr_mfc_delete(mrt, &mfc); + if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) + ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, - sk == rtnl_dereference(mrt->mroute_sk)); + sk == rtnl_dereference(mrt->mroute_sk), + parent); rtnl_unlock(); return ret; /* @@ -1274,9 +1362,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi case MRT_ASSERT: { int v; + if (optlen != sizeof(v)) + return -EINVAL; if (get_user(v, (int __user *)optval)) return -EFAULT; - mrt->mroute_do_assert = (v) ? 1 : 0; + mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IP_PIMSM @@ -1284,9 +1374,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi { int v; + if (optlen != sizeof(v)) + return -EINVAL; if (get_user(v, (int __user *)optval)) return -EFAULT; - v = (v) ? 
1 : 0; + v = !!v; rtnl_lock(); ret = 0; @@ -1308,6 +1400,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (get_user(v, (u32 __user *)optval)) return -EFAULT; + /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ + if (v != RT_TABLE_DEFAULT && v >= 1000000000) + return -EINVAL; + rtnl_lock(); ret = 0; if (sk == rtnl_dereference(mrt->mroute_sk)) { @@ -1315,7 +1411,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi } else { if (!ipmr_new_table(net, v)) ret = -ENOMEM; - raw_sk(sk)->ipmr_table = v; + else + raw_sk(sk)->ipmr_table = v; } rtnl_unlock(); return ret; @@ -1341,6 +1438,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int struct net *net = sock_net(sk); struct mr_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_IGMP) + return -EOPNOTSUPP; + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (mrt == NULL) return -ENOENT; @@ -1513,7 +1614,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; @@ -1562,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(iph, skb_dst(skb), NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1699,23 +1800,34 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ -static int ip_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *cache, - int local) +static void ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local) { int psend = -1; int vif, ct; + int true_vifi = ipmr_find_vif(mrt, skb->dev); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + struct mfc_cache *cache_proxy; + + /* For an (*,G) entry, we only check that the incomming + * interface is part of the static tree. + */ + cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + if (cache_proxy && + cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + goto forward; + } + /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != skb->dev) { - int true_vifi; - if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... 
@@ -1732,7 +1844,6 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, } cache->mfc_un.res.wrong_if++; - true_vifi = ipmr_find_vif(mrt, skb->dev); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -1750,15 +1861,34 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } +forward: mrt->vif_table[vif].pkt_in++; mrt->vif_table[vif].bytes_in += skb->len; /* * Forward the frame */ + if (cache->mfc_origin == htonl(INADDR_ANY) && + cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (true_vifi >= 0 && + true_vifi != cache->mfc_parent && + ip_hdr(skb)->ttl > + cache->mfc_un.res.ttls[cache->mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = cache->mfc_parent; + goto last_forward; + } + goto dont_forward; + } for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { - if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + /* For (*,G) entry, don't forward to the incoming interface */ + if ((cache->mfc_origin != htonl(INADDR_ANY) || + ct != true_vifi) && + ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -1769,6 +1899,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, psend = ct; } } +last_forward: if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -1777,14 +1908,13 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, ipmr_queue_xmit(net, mrt, skb2, cache, psend); } else { ipmr_queue_xmit(net, mrt, skb, cache, psend); - return 0; + return; } } dont_forward: if (!local) kfree_skb(skb); - return 0; } static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) @@ -1795,9 +1925,12 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = RT_TOS(iph->tos), - .flowi4_oif = rt->rt_oif, - .flowi4_iif = rt->rt_iif, - .flowi4_mark = rt->rt_mark, + .flowi4_oif = (rt_is_output_route(rt) ? + skb->dev->ifindex : 0), + .flowi4_iif = (rt_is_output_route(rt) ? 
+ LOOPBACK_IFINDEX : + skb->dev->ifindex), + .flowi4_mark = skb->mark, }; struct mr_table *mrt; int err; @@ -1855,6 +1988,13 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + if (cache == NULL) { + int vif = ipmr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + vif); + } /* * No usable cache entry @@ -1932,9 +2072,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->ip_summed = CHECKSUM_NONE; - skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, reg_dev); + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); @@ -2006,37 +2145,44 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, { int ct; struct rtnexthop *nhp; - u8 *b = skb_tail_pointer(skb); - struct rtattr *mp_head; + struct nlattr *mp_attr; + struct rta_mfc_stats mfcs; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mfc_parent >= MAXVIFS) return -ENOENT; - if (VIF_EXISTS(mrt, c->mfc_parent)) - RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); + if (VIF_EXISTS(mrt, c->mfc_parent) && + nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + return -EMSGSIZE; - mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) + return -EMSGSIZE; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) - goto rtattr_failure; - nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } - mp_head->rta_type = RTA_MULTIPATH; - mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + + nla_nest_end(skb, mp_attr); + + mfcs.mfcs_packets = c->mfc_un.res.pkt; + mfcs.mfcs_bytes = c->mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) + return -EMSGSIZE; + rtm->rtm_type = RTN_MULTICAST; return 1; - -rtattr_failure: - nlmsg_trim(skb, b); - return -EMSGSIZE; } int ipmr_get_route(struct net *net, struct sk_buff *skb, @@ -2053,7 +2199,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, rcu_read_lock(); cache = ipmr_cache_find(mrt, saddr, daddr); + if (cache == NULL && skb->dev) { + int vif = ipmr_find_vif(mrt, skb->dev); + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, daddr, vif); + } if (cache == NULL) { struct sk_buff *skb2; struct iphdr *iph; @@ -2104,12 +2255,14 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - u32 pid, u32 seq, struct mfc_cache *c) + u32 portid, u32 seq, struct mfc_cache *c, int cmd, + int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; + int err; - nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); + nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2123,13 +2276,18 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = 
RT_SCOPE_UNIVERSE; - rtm->rtm_protocol = RTPROT_UNSPEC; + if (c->mfc_flags & MFC_STATIC) + rtm->rtm_protocol = RTPROT_STATIC; + else + rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; - if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) + err = __ipmr_fill_mroute(mrt, skb, c, rtm); + /* do not break the dump if cache is unresolved */ + if (err < 0 && err != -ENOENT) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -2139,6 +2297,52 @@ nla_put_failure: return -EMSGSIZE; } +static size_t mroute_msgsize(bool unresolved, int maxvif) +{ + size_t len = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(4) /* RTA_SRC */ + + nla_total_size(4) /* RTA_DST */ + ; + + if (!unresolved) + len = len + + nla_total_size(4) /* RTA_IIF */ + + nla_total_size(0) /* RTA_MULTIPATH */ + + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) + /* RTA_MFC_STATS */ + + nla_total_size(sizeof(struct rta_mfc_stats)) + ; + + return len; +} + +static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, + int cmd) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), + GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); + if (err < 0) + goto errout; + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC); + return; + +errout: + kfree_skb(skb); + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); +} + static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -2163,15 +2367,33 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (e < s_e) goto next_entry; if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc) < 0) + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) goto done; next_entry: e++; } e = s_e = 0; } + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (ipmr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) { + spin_unlock_bh(&mfc_unres_lock); + goto done; + } +next_entry2: + e++; + } + spin_unlock_bh(&mfc_unres_lock); + e = s_e = 0; s_h = 0; next_table: t++; @@ -2487,16 +2709,16 @@ static int __net_init ipmr_net_init(struct net *net) #ifdef CONFIG_PROC_FS err = -ENOMEM; - if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops)) + if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops)) goto proc_vif_fail; - if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops)) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: - proc_net_remove(net, "ip_mr_vif"); + remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: ipmr_rules_exit(net); #endif @@ -2507,8 +2729,8 @@ fail: static void __net_exit ipmr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "ip_mr_cache"); - proc_net_remove(net, "ip_mr_vif"); + remove_proc_entry("ip_mr_cache", net->proc_net); + remove_proc_entry("ip_mr_vif", net->proc_net); #endif ipmr_rules_exit(net); } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index ed1b3678319..7ebd6e37875 100644 --- 
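With mroute_netlink_event() in place, additions and deletions of multicast forwarding cache entries are announced on the RTNLGRP_IPV4_MROUTE group instead of being visible only through /proc. A minimal userspace sketch, not part of this patch, that subscribes to the group via the legacy RTMGRP_IPV4_MROUTE bind mask:

/* Hedged example, not from the patch: watch IPv4 mroute notifications. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	char buf[8192];
	struct sockaddr_nl sa = {
		.nl_family = AF_NETLINK,
		.nl_groups = RTMGRP_IPV4_MROUTE,
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		if (len <= 0)
			break;
		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
			if (nlh->nlmsg_type == RTM_NEWROUTE)
				printf("mroute entry added/resolved\n");
			else if (nlh->nlmsg_type == RTM_DELROUTE)
				printf("mroute entry deleted\n");
		}
	}
	close(fd);
	return 0;
}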
a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -1,4 +1,9 @@ -/* IPv4 specific functions of netfilter core */ +/* + * IPv4 specific functions of netfilter core + * + * Rusty Russell (C) 2000 -- This code is GPL. + * Patrick McHardy (C) 2006-2012 + */ #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -40,14 +45,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) - return -1; + return PTR_ERR(rt); /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); if (skb_dst(skb)->error) - return -1; + return skb_dst(skb)->error; #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -56,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); if (IS_ERR(dst)) - return -1; + return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif @@ -66,49 +71,12 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) - return -1; + return -ENOMEM; return 0; } EXPORT_SYMBOL(ip_route_me_harder); -#ifdef CONFIG_XFRM -int ip_xfrm_me_harder(struct sk_buff *skb) -{ - struct flowi fl; - unsigned int hh_len; - struct dst_entry *dst; - - if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) - return 0; - if (xfrm_decode_session(skb, &fl, AF_INET) < 0) - return -1; - - dst = skb_dst(skb); - if (dst->xfrm) - dst = ((struct xfrm_dst *)dst)->route; - dst_hold(dst); - - dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); - if (IS_ERR(dst)) - return -1; - - skb_dst_drop(skb); - skb_dst_set(skb, dst); - - /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len && - pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) - return -1; - return 0; -} -EXPORT_SYMBOL(ip_xfrm_me_harder); -#endif - -void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); -EXPORT_SYMBOL(ip_nat_decode_session); - /* * Extra routing may needed on local out, as the QUEUE target never * returns control to the table. @@ -225,12 +193,12 @@ static const struct nf_afinfo nf_ip_afinfo = { .route_key_size = sizeof(struct ip_rt_info), }; -static int ipv4_netfilter_init(void) +static int __init ipv4_netfilter_init(void) { return nf_register_afinfo(&nf_ip_afinfo); } -static void ipv4_netfilter_fini(void) +static void __exit ipv4_netfilter_fini(void) { nf_unregister_afinfo(&nf_ip_afinfo); } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fcc543cd987..a26ce035e3f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,18 +36,41 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config IP_NF_QUEUE - tristate "IP Userspace queueing via NETLINK (OBSOLETE)" - depends on NETFILTER_ADVANCED +config NF_TABLES_IPV4 + depends on NF_TABLES + tristate "IPv4 nf_tables support" help - Netfilter has the ability to queue packets to user space: the - netlink device can be used to access them using this driver. + This option enables the IPv4 support for nf_tables. - This option enables the old IPv4-only "ip_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). 
+config NFT_CHAIN_ROUTE_IPV4 + depends on NF_TABLES_IPV4 + tristate "IPv4 nf_tables route chain support" + help + This option enables the "route" chain for IPv4 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, type of service and + the packet mark. + +config NFT_CHAIN_NAT_IPV4 + depends on NF_TABLES_IPV4 + depends on NF_NAT_IPV4 && NFT_NAT + tristate "IPv4 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv4 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + default NFT_REJECT + tristate - To compile it as a module, choose M here. If unsure, say N. +config NF_TABLES_ARP + depends on NF_TABLES + tristate "ARP nf_tables support" + help + This option enables the ARP support for nf_tables. config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" @@ -84,7 +107,7 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -123,8 +146,21 @@ config IP_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_ULOG - tristate "ULOG target support" + tristate "ULOG target support (obsolete)" default m if NETFILTER_ADVANCED=n ---help--- @@ -143,25 +179,22 @@ config IP_NF_TARGET_ULOG To compile it as a module, choose M here. If unsure, say N. # NAT + specific targets: nf_conntrack -config NF_NAT - tristate "Full NAT" +config NF_NAT_IPV4 + tristate "IPv4 NAT" depends on NF_CONNTRACK_IPV4 default m if NETFILTER_ADVANCED=n + select NF_NAT help - The Full NAT option allows masquerading, port forwarding and other + The IPv4 NAT option allows masquerading, port forwarding and other forms of full Network Address Port Translation. It is controlled by the `nat' table in iptables: see the man page for iptables(8). To compile it as a module, choose M here. If unsure, say N. -config NF_NAT_NEEDED - bool - depends on NF_NAT - default y +if NF_NAT_IPV4 config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - depends on NF_NAT default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are @@ -174,30 +207,27 @@ config IP_NF_TARGET_MASQUERADE config IP_NF_TARGET_NETMAP tristate "NETMAP target support" - depends on NF_NAT depends on NETFILTER_ADVANCED - help - NETMAP is an implementation of static 1:1 NAT mapping of network - addresses. It maps the network address part, while keeping the host - address part intact. - - To compile it as a module, choose M here. If unsure, say N. 
+ select NETFILTER_XT_TARGET_NETMAP + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_NETMAP. config IP_NF_TARGET_REDIRECT tristate "REDIRECT target support" - depends on NF_NAT depends on NETFILTER_ADVANCED - help - REDIRECT is a special case of NAT: all incoming connections are - mapped onto the incoming interface's address, causing the packets to - come to the local machine instead of passing through. This is - useful for transparent proxies. + select NETFILTER_XT_TARGET_REDIRECT + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_REDIRECT. - To compile it as a module, choose M here. If unsure, say N. +endif config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_CONNTRACK_SNMP && NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4 depends on NETFILTER_ADVANCED default NF_NAT && NF_CONNTRACK_SNMP ---help--- @@ -219,61 +249,21 @@ config NF_NAT_SNMP_BASIC # <expr> '&&' <expr> (6) # # (6) Returns the result of min(/expr/, /expr/). -config NF_NAT_PROTO_DCCP - tristate - depends on NF_NAT && NF_CT_PROTO_DCCP - default NF_NAT && NF_CT_PROTO_DCCP config NF_NAT_PROTO_GRE tristate - depends on NF_NAT && NF_CT_PROTO_GRE - -config NF_NAT_PROTO_UDPLITE - tristate - depends on NF_NAT && NF_CT_PROTO_UDPLITE - default NF_NAT && NF_CT_PROTO_UDPLITE - -config NF_NAT_PROTO_SCTP - tristate - default NF_NAT && NF_CT_PROTO_SCTP - depends on NF_NAT && NF_CT_PROTO_SCTP - select LIBCRC32C - -config NF_NAT_FTP - tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_FTP - -config NF_NAT_IRC - tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_IRC - -config NF_NAT_TFTP - tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_TFTP - -config NF_NAT_AMANDA - tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_AMANDA + depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE config NF_NAT_PPTP tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_PPTP + depends on NF_CONNTRACK && NF_NAT_IPV4 + default NF_NAT_IPV4 && NF_CONNTRACK_PPTP select NF_NAT_PROTO_GRE config NF_NAT_H323 tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_H323 - -config NF_NAT_SIP - tristate - depends on NF_CONNTRACK && NF_NAT - default NF_NAT && NF_CONNTRACK_SIP + depends on NF_CONNTRACK && NF_NAT_IPV4 + default NF_NAT_IPV4 && NF_CONNTRACK_H323 # mangle + specific targets config IP_NF_MANGLE @@ -287,8 +277,8 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. 
config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_MANGLE && EXPERIMENTAL + tristate "CLUSTERIP target support" + depends on IP_NF_MANGLE depends on NF_CONNTRACK_IPV4 depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c20674dc945..90b82405331 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -10,32 +10,28 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o endif endif -nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o -iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o - # connection tracking obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o -obj-$(CONFIG_NF_NAT) += nf_nat.o +nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o # NAT helpers (nf_conntrack) -obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o -obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o -obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o -obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o # NAT protocols (nf_nat) -obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o -obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o + +obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o +obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o +obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o @@ -43,7 +39,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_NF_NAT) += iptable_nat.o +obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o @@ -55,9 +51,8 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o -obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o -obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97e61eadf58..f95b6f93814 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -6,6 +6,7 @@ * Some ARP specific bits are: * * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -270,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. 
+ */ + smp_read_barrier_depends(); table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -901,7 +907,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -958,7 +964,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, } t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", @@ -1001,7 +1007,7 @@ static int __do_replace(struct net *net, const char *name, t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1038,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; @@ -1158,7 +1166,7 @@ static int do_add_counters(struct net *net, const void __user *user, } t = xt_find_table_lock(net, NFPROTO_ARP, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } @@ -1533,7 +1541,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1646,7 +1654,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1677,7 +1685,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1698,7 +1706,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1722,7 +1730,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 79ca5e70d49..802ddecb30b 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,14 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(unsigned int hook, struct sk_buff *skb, +arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { 
const struct net *net = dev_net((in != NULL) ? in : out); - return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); + return arpt_do_table(skb, ops->hooknum, in, out, + net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; @@ -48,9 +49,7 @@ static int __net_init arptable_filter_net_init(struct net *net) net->ipv4.arptable_filter = arpt_register_table(net, &packet_filter, repl); kfree(repl); - if (IS_ERR(net->ipv4.arptable_filter)) - return PTR_ERR(net->ipv4.arptable_filter); - return 0; + return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); } static void __net_exit arptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 170b1fdd6b7..99e810f8467 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -3,6 +3,7 @@ * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -182,8 +183,7 @@ ipt_get_target_c(const struct ipt_entry *e) return ipt_get_target((struct ipt_entry *)e); } -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", @@ -259,6 +259,7 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; + struct net *net = dev_net(in ? in : out); table_base = private->entries[smp_processor_id()]; root = get_entry(table_base, private->hook_entry[hook]); @@ -271,7 +272,7 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, + nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } @@ -326,6 +327,11 @@ ipt_do_table(struct sk_buff *skb, addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); + /* + * Ensure we load private-> members after we've fetched the base + * pointer. 
+ */ + smp_read_barrier_depends(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); @@ -361,8 +367,7 @@ ipt_do_table(struct sk_buff *skb, t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(skb, hook, in, out, @@ -1090,7 +1095,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1149,7 +1154,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, } t = xt_find_table_lock(net, AF_INET, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) @@ -1189,7 +1194,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1226,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; @@ -1347,7 +1354,7 @@ do_add_counters(struct net *net, const void __user *user, } t = xt_find_table_lock(net, AF_INET, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? 
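The smp_read_barrier_depends() calls added to arpt_do_table() and ipt_do_table() pair with the table-replacement path, which fully initialises the new xt_table_info before publishing it through table->private. A generic sketch of that publish/consume pattern follows; it is illustrative only, the struct and function names are not symbols from this patch, and it assumes a kernel context where the barrier primitives are available.

/* Hedged sketch, not from the patch: publisher/consumer ordering. */
#include <asm/barrier.h>	/* smp_wmb(), smp_read_barrier_depends() */

struct cfg {
	int a;
	int b;
};

static struct cfg *live_cfg;

/* Writer: initialise the object completely, then publish the pointer
 * (in xtables this is the replace path storing table->private). */
static void publish(struct cfg *c)
{
	c->a = 1;
	c->b = 2;
	smp_wmb();		/* order the stores before the pointer store */
	live_cfg = c;
}

/* Reader: fetch the pointer first, then make sure dependent loads of its
 * members are not reordered before it (only Alpha needs a real barrier). */
static int reader(void)
{
	struct cfg *c = live_cfg;

	smp_read_barrier_depends();
	return c->a + c->b;	/* guaranteed to see the initialised values */
}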
PTR_ERR(t) : -ENOENT; goto free; } @@ -1846,7 +1853,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1931,7 +1938,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); @@ -1961,7 +1968,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1983,7 +1990,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2008,7 +2015,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index fe5daea5214..2510c02c2d2 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -28,6 +28,7 @@ #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/checksum.h> #include <net/ip.h> @@ -57,15 +58,21 @@ struct clusterip_config { struct rcu_head rcu; }; -static LIST_HEAD(clusterip_configs); +#ifdef CONFIG_PROC_FS +static const struct file_operations clusterip_proc_fops; +#endif -/* clusterip_lock protects the clusterip_configs list */ -static DEFINE_SPINLOCK(clusterip_lock); +static int clusterip_net_id __read_mostly; + +struct clusterip_net { + struct list_head configs; + /* lock protects the configs list */ + spinlock_t lock; #ifdef CONFIG_PROC_FS -static const struct file_operations clusterip_proc_fops; -static struct proc_dir_entry *clusterip_procdir; + struct proc_dir_entry *procdir; #endif +}; static inline void clusterip_config_get(struct clusterip_config *c) @@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c) static inline void clusterip_config_entry_put(struct clusterip_config *c) { + struct net *net = dev_net(c->dev); + struct clusterip_net *cn = net_generic(net, clusterip_net_id); + local_bh_disable(); - if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { + if (atomic_dec_and_lock(&c->entries, &cn->lock)) { list_del_rcu(&c->list); - spin_unlock(&clusterip_lock); + spin_unlock(&cn->lock); local_bh_enable(); dev_mc_del(c->dev, c->clustermac); @@ -105,7 +115,7 @@ clusterip_config_entry_put(struct clusterip_config *c) * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. 
*/ #ifdef CONFIG_PROC_FS - remove_proc_entry(c->pde->name, c->pde->parent); + proc_remove(c->pde); #endif return; } @@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c) } static struct clusterip_config * -__clusterip_config_find(__be32 clusterip) +__clusterip_config_find(struct net *net, __be32 clusterip) { struct clusterip_config *c; + struct clusterip_net *cn = net_generic(net, clusterip_net_id); - list_for_each_entry_rcu(c, &clusterip_configs, list) { + list_for_each_entry_rcu(c, &cn->configs, list) { if (c->clusterip == clusterip) return c; } @@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip) } static inline struct clusterip_config * -clusterip_config_find_get(__be32 clusterip, int entry) +clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) { struct clusterip_config *c; rcu_read_lock_bh(); - c = __clusterip_config_find(clusterip); + c = __clusterip_config_find(net, clusterip); if (c) { if (unlikely(!atomic_inc_not_zero(&c->refcount))) c = NULL; @@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, struct net_device *dev) { struct clusterip_config *c; + struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id); c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) @@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, - clusterip_procdir, + cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { kfree(c); @@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, } #endif - spin_lock_bh(&clusterip_lock); - list_add_rcu(&c->list, &clusterip_configs); - spin_unlock_bh(&clusterip_lock); + spin_lock_bh(&cn->lock); + list_add_rcu(&c->list, &cn->configs); + spin_unlock_bh(&cn->lock); return c; } @@ -370,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) /* FIXME: further sanity checks */ - config = clusterip_config_find_get(e->ip.dst.s_addr, 1); + config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); if (!config) { if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { pr_info("no config found for %pI4, need 'new'\n", @@ -384,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - dev = dev_get_by_name(&init_net, e->ip.iniface); + dev = dev_get_by_name(par->net, e->ip.iniface); if (!dev) { pr_info("no such interface %s\n", e->ip.iniface); @@ -483,7 +495,7 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(unsigned int hook, +arp_mangle(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -492,6 +504,7 @@ arp_mangle(unsigned int hook, struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; + struct net *net = dev_net(in ? 
in : out); /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || @@ -508,7 +521,7 @@ arp_mangle(unsigned int hook, /* if there is no clusterip configuration for the arp reply's * source ip, we don't want to mangle it */ - c = clusterip_config_find_get(payload->src_ip, 0); + c = clusterip_config_find_get(net, payload->src_ip, 0); if (!c) return NF_ACCEPT; @@ -631,7 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct clusterip_config *c = PDE(inode)->data; + struct clusterip_config *c = PDE_DATA(inode); sf->private = c; @@ -643,7 +656,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct clusterip_config *c = PDE(inode)->data; + struct clusterip_config *c = PDE_DATA(inode); int ret; ret = seq_release(inode, file); @@ -657,10 +670,11 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { - struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data; + struct clusterip_config *c = PDE_DATA(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; + int rc; if (size > PROC_WRITELEN) return -EIO; @@ -669,11 +683,15 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, buffer[size] = 0; if (*buffer == '+') { - nodenum = simple_strtoul(buffer+1, NULL, 10); + rc = kstrtoul(buffer+1, 10, &nodenum); + if (rc) + return rc; if (clusterip_add_node(c, nodenum)) return -ENOMEM; } else if (*buffer == '-') { - nodenum = simple_strtoul(buffer+1, NULL,10); + rc = kstrtoul(buffer+1, 10, &nodenum); + if (rc) + return rc; if (clusterip_del_node(c, nodenum)) return -ENOENT; } else @@ -693,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = { #endif /* CONFIG_PROC_FS */ +static int clusterip_net_init(struct net *net) +{ + struct clusterip_net *cn = net_generic(net, clusterip_net_id); + + INIT_LIST_HEAD(&cn->configs); + + spin_lock_init(&cn->lock); + +#ifdef CONFIG_PROC_FS + cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); + if (!cn->procdir) { + pr_err("Unable to proc dir entry\n"); + return -ENOMEM; + } +#endif /* CONFIG_PROC_FS */ + + return 0; +} + +static void clusterip_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + struct clusterip_net *cn = net_generic(net, clusterip_net_id); + proc_remove(cn->procdir); +#endif +} + +static struct pernet_operations clusterip_net_ops = { + .init = clusterip_net_init, + .exit = clusterip_net_exit, + .id = &clusterip_net_id, + .size = sizeof(struct clusterip_net), +}; + static int __init clusterip_tg_init(void) { int ret; - ret = xt_register_target(&clusterip_tg_reg); + ret = register_pernet_subsys(&clusterip_net_ops); if (ret < 0) return ret; + ret = xt_register_target(&clusterip_tg_reg); + if (ret < 0) + goto cleanup_subsys; + ret = nf_register_hook(&cip_arp_ops); if (ret < 0) goto cleanup_target; -#ifdef CONFIG_PROC_FS - clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); - if (!clusterip_procdir) { - pr_err("Unable to proc dir entry\n"); - ret = -ENOMEM; - goto cleanup_hook; - } -#endif /* CONFIG_PROC_FS */ - pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); + return 0; -#ifdef CONFIG_PROC_FS -cleanup_hook: - nf_unregister_hook(&cip_arp_ops); -#endif /* 
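The CLUSTERIP changes above replace the global configuration list, lock and proc directory with per-network-namespace state. The generic shape of such a conversion is sketched below with made-up names (foo_net, foo_net_id, foo_net_ops), not symbols from this patch, assuming the usual pernet_operations API:

/* Hedged sketch, not from the patch: per-netns state via net_generic(). */
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct foo_net {
	struct list_head configs;	/* was a global list */
	spinlock_t lock;		/* was a global lock */
};

static int foo_net_id __read_mostly;

static int __net_init foo_net_init(struct net *net)
{
	struct foo_net *fn = net_generic(net, foo_net_id);

	INIT_LIST_HEAD(&fn->configs);
	spin_lock_init(&fn->lock);
	return 0;
}

static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.id   = &foo_net_id,
	.size = sizeof(struct foo_net),
};

/* register_pernet_subsys(&foo_net_ops) from the module init path allocates a
 * struct foo_net per namespace, so net_generic(net, foo_net_id) returns
 * namespace-local state instead of sharing globals across namespaces. */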
CONFIG_PROC_FS */ cleanup_target: xt_unregister_target(&clusterip_tg_reg); +cleanup_subsys: + unregister_pernet_subsys(&clusterip_net_ops); return ret; } static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); -#ifdef CONFIG_PROC_FS - remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); -#endif + nf_unregister_hook(&cip_arp_ops); xt_unregister_target(&clusterip_tg_reg); + unregister_pernet_subsys(&clusterip_net_ops); /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ rcu_barrier_bh(); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 2f210c79dc8..00352ce0f0d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -19,9 +19,9 @@ #include <net/ip.h> #include <net/checksum.h> #include <net/route.h> -#include <net/netfilter/nf_nat_rule.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -49,10 +49,10 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; - struct nf_nat_ipv4_range newrange; + struct nf_nat_range newrange; const struct nf_nat_ipv4_multi_range_compat *mr; const struct rtable *rt; - __be32 newsrc; + __be32 newsrc, nh; NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); @@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) mr = par->targinfo; rt = skb_rtable(skb); - newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + nh = rt_nexthop(rt, ip_hdr(skb)->daddr); + newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", par->out->name); return NF_DROP; @@ -79,10 +80,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) nat->masq_index = par->out->ifindex; /* Transfer from original range. */ - newrange = ((struct nf_nat_ipv4_range) - { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS, - newsrc, newsrc, - mr->range[0].min, mr->range[0].max }); + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newsrc; + newrange.max_addr.ip = newsrc; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; /* Hand modified range to generic setup. 
*/ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); @@ -95,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex) if (!nat) return 0; - + if (nf_ct_l3num(i) != NFPROTO_IPV4) + return 0; return nat->masq_index == (int)(long)ifindex; } @@ -103,7 +108,7 @@ static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - const struct net_device *dev = ptr; + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { @@ -113,7 +118,7 @@ static int masq_device_event(struct notifier_block *this, NF_CT_ASSERT(dev->ifindex != 0); nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); } return NOTIFY_DONE; @@ -124,7 +129,10 @@ static int masq_inet_event(struct notifier_block *this, void *ptr) { struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - return masq_device_event(this, event, dev); + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return masq_device_event(this, event, &info); } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c deleted file mode 100644 index b5bfbbabf70..00000000000 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ /dev/null @@ -1,98 +0,0 @@ -/* NETMAP - static NAT mapping of IP network addresses (1:1). - * The mapping can be applied to source (POSTROUTING), - * destination (PREROUTING), or both (with separate rules). - */ - -/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat_rule.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); -MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); - -static int netmap_tg_check(const struct xt_tgchk_param *par) -{ - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) { - pr_debug("bad MAP_IPS.\n"); - return -EINVAL; - } - if (mr->rangesize != 1) { - pr_debug("bad rangesize %u.\n", mr->rangesize); - return -EINVAL; - } - return 0; -} - -static unsigned int -netmap_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 new_ip, netmask; - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_ipv4_range newrange; - - NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_POST_ROUTING || - par->hooknum == NF_INET_LOCAL_OUT || - par->hooknum == NF_INET_LOCAL_IN); - ct = nf_ct_get(skb, &ctinfo); - - netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_OUT) - new_ip = ip_hdr(skb)->daddr & ~netmask; - else - new_ip = ip_hdr(skb)->saddr & ~netmask; - new_ip |= mr->range[0].min_ip & netmask; - - newrange = ((struct nf_nat_ipv4_range) - { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS, - new_ip, new_ip, - mr->range[0].min, mr->range[0].max }); - - /* Hand modified range to generic setup. 
*/ - return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); -} - -static struct xt_target netmap_tg_reg __read_mostly = { - .name = "NETMAP", - .family = NFPROTO_IPV4, - .target = netmap_tg, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_LOCAL_IN), - .checkentry = netmap_tg_check, - .me = THIS_MODULE -}; - -static int __init netmap_tg_init(void) -{ - return xt_register_target(&netmap_tg_reg); -} - -static void __exit netmap_tg_exit(void) -{ - xt_unregister_target(&netmap_tg_reg); -} - -module_init(netmap_tg_init); -module_exit(netmap_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c deleted file mode 100644 index 7c0103a5203..00000000000 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ /dev/null @@ -1,110 +0,0 @@ -/* Redirect. Simple mapping which alters dst to a local IP address. */ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/timer.h> -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/netdevice.h> -#include <linux/if.h> -#include <linux/inetdevice.h> -#include <net/protocol.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat_rule.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); - -/* FIXME: Take multiple ranges --RR */ -static int redirect_tg_check(const struct xt_tgchk_param *par) -{ - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { - pr_debug("bad MAP_IPS.\n"); - return -EINVAL; - } - if (mr->rangesize != 1) { - pr_debug("bad rangesize %u.\n", mr->rangesize); - return -EINVAL; - } - return 0; -} - -static unsigned int -redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 newdst; - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_ipv4_range newrange; - - NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_OUT); - - ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - - /* Local packets: make them go to loopback */ - if (par->hooknum == NF_INET_LOCAL_OUT) - newdst = htonl(0x7F000001); - else { - struct in_device *indev; - struct in_ifaddr *ifa; - - newdst = 0; - - rcu_read_lock(); - indev = __in_dev_get_rcu(skb->dev); - if (indev && (ifa = indev->ifa_list)) - newdst = ifa->ifa_local; - rcu_read_unlock(); - - if (!newdst) - return NF_DROP; - } - - /* Transfer from original range. */ - newrange = ((struct nf_nat_ipv4_range) - { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS, - newdst, newdst, - mr->range[0].min, mr->range[0].max }); - - /* Hand modified range to generic setup. 
*/ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); -} - -static struct xt_target redirect_tg_reg __read_mostly = { - .name = "REDIRECT", - .family = NFPROTO_IPV4, - .target = redirect_tg, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), - .checkentry = redirect_tg_check, - .me = THIS_MODULE, -}; - -static int __init redirect_tg_init(void) -{ - return xt_register_target(&redirect_tg_reg); -} - -static void __exit redirect_tg_exit(void) -{ - xt_unregister_target(&redirect_tg_reg); -} - -module_init(redirect_tg_init); -module_exit(redirect_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 51f13f8ec72..5b6e0df4ccf 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -17,10 +17,6 @@ #include <linux/udp.h> #include <linux/icmp.h> #include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/route.h> -#include <net/dst.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_REJECT.h> @@ -28,108 +24,12 @@ #include <linux/netfilter_bridge.h> #endif +#include <net/netfilter/ipv4/nf_reject.h> + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); - if (oth == NULL) - return; - - /* No RST for RST. 
*/ - if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; - - /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; - - skb_reserve(nskb, LL_MAX_HEADER); - - skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - niph->version = 4; - niph->ihl = sizeof(struct iphdr) / 4; - niph->tos = 0; - niph->id = 0; - niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; - niph->check = 0; - niph->saddr = oiph->daddr; - niph->daddr = oiph->saddr; - - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); - tcph->source = oth->dest; - tcph->dest = oth->source; - tcph->doff = sizeof(struct tcphdr) / 4; - - if (oth->ack) - tcph->seq = oth->ack_seq; - else { - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + - oldskb->len - ip_hdrlen(oldskb) - - (oth->doff << 2)); - tcph->ack = 1; - } - - tcph->rst = 1; - tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, - niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)tcph - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set_noref(nskb, skb_dst(oldskb)); - - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) - goto free_nskb; - - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - - /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) - goto free_nskb; - - nf_ct_attach(nskb, oldskb); - - ip_local_out(nskb); - return; - - free_nskb: - kfree_skb(nskb); -} - -static inline void send_unreach(struct sk_buff *skb_in, int code) -{ - icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); -} - static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -137,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - send_unreach(skb, ICMP_NET_UNREACH); + nf_send_unreach(skb, ICMP_NET_UNREACH); break; case IPT_ICMP_HOST_UNREACHABLE: - send_unreach(skb, ICMP_HOST_UNREACH); + nf_send_unreach(skb, ICMP_HOST_UNREACH); break; case IPT_ICMP_PROT_UNREACHABLE: - send_unreach(skb, ICMP_PROT_UNREACH); + nf_send_unreach(skb, ICMP_PROT_UNREACH); break; case IPT_ICMP_PORT_UNREACHABLE: - send_unreach(skb, ICMP_PORT_UNREACH); + nf_send_unreach(skb, ICMP_PORT_UNREACH); break; case IPT_ICMP_NET_PROHIBITED: - send_unreach(skb, ICMP_NET_ANO); + nf_send_unreach(skb, ICMP_NET_ANO); break; case IPT_ICMP_HOST_PROHIBITED: - send_unreach(skb, ICMP_HOST_ANO); + nf_send_unreach(skb, ICMP_HOST_ANO); break; case IPT_ICMP_ADMIN_PROHIBITED: - send_unreach(skb, ICMP_PKT_FILTERED); + nf_send_unreach(skb, ICMP_PKT_FILTERED); break; case IPT_TCP_RESET: - send_reset(skb, par->hooknum); + nf_send_reset(skb, par->hooknum); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. 
*/ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 00000000000..a313c3fbeb4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct iphdr * +synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + 
synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = 
mss; + opts->options |= XT_SYNPROXY_OPT_MSS; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + if (!synproxy_parse_options(skb, par->thoff, th, &opts)) + return NF_DROP; + + if (th->syn && !(th->ack || th->fin || th->rst)) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + return NF_DROP; + + } else if (th->ack && !(th->fin || th->rst || th->syn)) { + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + return NF_DROP; + } + + return XT_CONTINUE; +} + +static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. 
+ */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.proto != IPPROTO_TCP || + e->ip.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg4_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV4, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), + .target = synproxy_tg4, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg4_check, + .destroy = synproxy_tg4_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg4_init(void) +{ + int err; + + err = nf_register_hooks(ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg4_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg4_exit(void) +{ + xt_unregister_target(&synproxy_tg4_reg); + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +} + +module_init(synproxy_tg4_init); +module_exit(synproxy_tg4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index ba5756d2016..9cb993cd224 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -4,6 +4,7 @@ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -37,7 +38,7 @@ #include <linux/skbuff.h> #include <linux/kernel.h> #include <linux/timer.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netdevice.h> #include 
<linux/mm.h> #include <linux/moduleparam.h> @@ -45,6 +46,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_ULOG.h> #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <linux/bitops.h> #include <asm/unaligned.h> @@ -78,20 +80,26 @@ typedef struct { struct timer_list timer; /* the timer function */ } ulog_buff_t; -static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ +static int ulog_net_id __read_mostly; +struct ulog_net { + unsigned int nlgroup[ULOG_MAXNLGROUPS]; + ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; + struct sock *nflognl; + spinlock_t lock; +}; -static struct sock *nflognl; /* our socket */ -static DEFINE_SPINLOCK(ulog_lock); /* spinlock */ +static struct ulog_net *ulog_pernet(struct net *net) +{ + return net_generic(net, ulog_net_id); +} /* send one ulog_buff_t to userspace */ -static void ulog_send(unsigned int nlgroupnum) +static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum) { - ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; + ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; - if (timer_pending(&ub->timer)) { - pr_debug("ulog_send: timer was pending, deleting\n"); - del_timer(&ub->timer); - } + pr_debug("ulog_send: timer is deleting\n"); + del_timer(&ub->timer); if (!ub->skb) { pr_debug("ulog_send: nothing to send\n"); @@ -105,7 +113,8 @@ static void ulog_send(unsigned int nlgroupnum) NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; pr_debug("throwing %d packets to netlink group %u\n", ub->qlen, nlgroupnum + 1); - netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); + netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, + GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; @@ -116,13 +125,17 @@ static void ulog_send(unsigned int nlgroupnum) /* timer function to flush queue in flushtimeout time */ static void ulog_timer(unsigned long data) { + unsigned int groupnum = *((unsigned int *)data); + struct ulog_net *ulog = container_of((void *)data, + struct ulog_net, + nlgroup[groupnum]); pr_debug("timer function called, calling ulog_send\n"); /* lock to protect against somebody modifying our structure * from ipt_ulog_target at the same time */ - spin_lock_bh(&ulog_lock); - ulog_send(data); - spin_unlock_bh(&ulog_lock); + spin_lock_bh(&ulog->lock); + ulog_send(ulog, groupnum); + spin_unlock_bh(&ulog->lock); } static struct sk_buff *ulog_alloc_skb(unsigned int size) @@ -150,7 +163,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ipt_ulog_packet(unsigned int hooknum, +static void ipt_ulog_packet(struct net *net, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -162,6 +176,7 @@ static void ipt_ulog_packet(unsigned int hooknum, size_t size, copy_len; struct nlmsghdr *nlh; struct timeval tv; + struct ulog_net *ulog = ulog_pernet(net); /* ffs == find first bit set, necessary because userspace * is already shifting groupnumber, but we need unshifted. 
@@ -174,11 +189,11 @@ static void ipt_ulog_packet(unsigned int hooknum, else copy_len = loginfo->copy_range; - size = NLMSG_SPACE(sizeof(*pm) + copy_len); + size = nlmsg_total_size(sizeof(*pm) + copy_len); - ub = &ulog_buffers[groupnum]; + ub = &ulog->ulog_buffers[groupnum]; - spin_lock_bh(&ulog_lock); + spin_lock_bh(&ulog->lock); if (!ub->skb) { if (!(ub->skb = ulog_alloc_skb(size))) @@ -188,7 +203,7 @@ static void ipt_ulog_packet(unsigned int hooknum, /* either the queue len is too high or we don't have * enough room in nlskb left. send it to userspace. */ - ulog_send(groupnum); + ulog_send(ulog, groupnum); if (!(ub->skb = ulog_alloc_skb(size))) goto alloc_failure; @@ -196,12 +211,16 @@ static void ipt_ulog_packet(unsigned int hooknum, pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); - /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ - nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, - sizeof(*pm)+copy_len); + nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, + sizeof(*pm)+copy_len, 0); + if (!nlh) { + pr_debug("error during nlmsg_put\n"); + goto out_unlock; + } ub->qlen++; - pm = NLMSG_DATA(nlh); + pm = nlmsg_data(nlh); + memset(pm, 0, sizeof(*pm)); /* We might not have a timestamp, get one */ if (skb->tstamp.tv64 == 0) @@ -214,12 +233,12 @@ static void ipt_ulog_packet(unsigned int hooknum, put_unaligned(tv.tv_usec, &pm->timestamp_usec); put_unaligned(skb->mark, &pm->mark); pm->hook = hooknum; - if (prefix != NULL) - strncpy(pm->prefix, prefix, sizeof(pm->prefix)); + if (prefix != NULL) { + strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); + pm->prefix[sizeof(pm->prefix) - 1] = '\0'; + } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - else - *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 && skb->mac_header != skb->network_header && @@ -231,13 +250,9 @@ static void ipt_ulog_packet(unsigned int hooknum, if (in) strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - else - pm->indev_name[0] = '\0'; if (out) strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - else - pm->outdev_name[0] = '\0'; /* copy_len <= skb->len, so can't fail. */ if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) @@ -259,29 +274,30 @@ static void ipt_ulog_packet(unsigned int hooknum, if (ub->qlen >= loginfo->qthreshold) { if (loginfo->qthreshold > 1) nlh->nlmsg_type = NLMSG_DONE; - ulog_send(groupnum); + ulog_send(ulog, groupnum); } - - spin_unlock_bh(&ulog_lock); +out_unlock: + spin_unlock_bh(&ulog->lock); return; -nlmsg_failure: - pr_debug("error during NLMSG_PUT\n"); alloc_failure: pr_debug("Error building netlink message\n"); - spin_unlock_bh(&ulog_lock); + spin_unlock_bh(&ulog->lock); } static unsigned int ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) { - ipt_ulog_packet(par->hooknum, skb, par->in, par->out, + struct net *net = dev_net(par->in ? 
par->in : par->out); + + ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, par->targinfo, NULL); return XT_CONTINUE; } -static void ipt_logfn(u_int8_t pf, +static void ipt_logfn(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -303,13 +319,19 @@ static void ipt_logfn(u_int8_t pf, strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); } - ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); + ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); } static int ulog_tg_check(const struct xt_tgchk_param *par) { const struct ipt_ulog_info *loginfo = par->targinfo; + if (!par->net->xt.ulog_warn_deprecated) { + pr_info("ULOG is deprecated and it will be removed soon, " + "use NFLOG instead\n"); + par->net->xt.ulog_warn_deprecated = true; + } + if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { pr_debug("prefix not null-terminated\n"); return -EINVAL; @@ -377,57 +399,48 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { .me = THIS_MODULE, }; -static int __init ulog_tg_init(void) +static int __net_init ulog_tg_net_init(struct net *net) { - int ret, i; - - pr_debug("init module\n"); - - if (nlbufsiz > 128*1024) { - pr_warning("Netlink buffer has to be <= 128kB\n"); - return -EINVAL; - } + int i; + struct ulog_net *ulog = ulog_pernet(net); + struct netlink_kernel_cfg cfg = { + .groups = ULOG_MAXNLGROUPS, + }; + spin_lock_init(&ulog->lock); /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) - setup_timer(&ulog_buffers[i].timer, ulog_timer, i); + for (i = 0; i < ULOG_MAXNLGROUPS; i++) { + ulog->nlgroup[i] = i; + setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, + (unsigned long)&ulog->nlgroup[i]); + } - nflognl = netlink_kernel_create(&init_net, - NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - NULL, THIS_MODULE); - if (!nflognl) + ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); + if (!ulog->nflognl) return -ENOMEM; - ret = xt_register_target(&ulog_tg_reg); - if (ret < 0) { - netlink_kernel_release(nflognl); - return ret; - } if (nflog) - nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); + nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger); return 0; } -static void __exit ulog_tg_exit(void) +static void __net_exit ulog_tg_net_exit(struct net *net) { ulog_buff_t *ub; int i; - - pr_debug("cleanup_module\n"); + struct ulog_net *ulog = ulog_pernet(net); if (nflog) - nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ulog_tg_reg); - netlink_kernel_release(nflognl); + nf_log_unset(net, &ipt_ulog_logger); + + netlink_kernel_release(ulog->nflognl); /* remove pending timers and free allocated skb's */ for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ub = &ulog_buffers[i]; - if (timer_pending(&ub->timer)) { - pr_debug("timer was pending, deleting\n"); - del_timer(&ub->timer); - } + ub = &ulog->ulog_buffers[i]; + pr_debug("timer is deleting\n"); + del_timer(&ub->timer); if (ub->skb) { kfree_skb(ub->skb); @@ -436,5 +449,50 @@ static void __exit ulog_tg_exit(void) } } +static struct pernet_operations ulog_tg_net_ops = { + .init = ulog_tg_net_init, + .exit = ulog_tg_net_exit, + .id = &ulog_net_id, + .size = sizeof(struct ulog_net), +}; + +static int __init ulog_tg_init(void) +{ + int ret; + pr_debug("init module\n"); + + if (nlbufsiz > 128*1024) { + pr_warn("Netlink buffer has to be <= 128kB\n"); + return -EINVAL; + } + + ret = register_pernet_subsys(&ulog_tg_net_ops); + if (ret) + goto out_pernet; + + ret = xt_register_target(&ulog_tg_reg); + if (ret < 0) + goto out_target; + 
+ if (nflog) + nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); + + return 0; + +out_target: + unregister_pernet_subsys(&ulog_tg_net_ops); +out_pernet: + return ret; +} + +static void __exit ulog_tg_exit(void) +{ + pr_debug("cleanup_module\n"); + if (nflog) + nf_log_unregister(&ipt_ulog_logger); + xt_unregister_target(&ulog_tg_reg); + unregister_pernet_subsys(&ulog_tg_net_ops); +} + module_init(ulog_tg_init); module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 31371be8174..4bfaedf9b34 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -66,6 +66,12 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, return dev_match; } +static bool rpfilter_is_local(const struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + return rt && (rt->rt_flags & RTCF_LOCAL); +} + static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rpfilter_info *info; @@ -76,18 +82,15 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) info = par->matchinfo; invert = info->flags & XT_RPFILTER_INVERT; - if (par->in->flags & IFF_LOOPBACK) + if (rpfilter_is_local(skb)) return true ^ invert; iph = ip_hdr(skb); if (ipv4_is_multicast(iph->daddr)) { if (ipv4_is_zeronet(iph->saddr)) return ipv4_is_local_multicast(iph->daddr) ^ invert; - flow.flowi4_iif = 0; - } else { - flow.flowi4_iif = dev_net(par->in)->loopback_dev->ifindex; } - + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 851acec852d..e08a74a243a 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,20 +33,21 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(unsigned int hook, struct sk_buff *skb, +iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (hook == NF_INET_LOCAL_OUT && + if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? 
in : out); - return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); + return ipt_do_table(skb, ops->hooknum, in, out, + net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; @@ -69,9 +70,7 @@ static int __net_init iptable_filter_net_init(struct net *net) net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, repl); kfree(repl); - if (IS_ERR(net->ipv4.iptable_filter)) - return PTR_ERR(net->ipv4.iptable_filter); - return 0; + return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); } static void __net_exit iptable_filter_net_exit(struct net *net) @@ -96,14 +95,10 @@ static int __init iptable_filter_init(void) filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); if (IS_ERR(filter_ops)) { ret = PTR_ERR(filter_ops); - goto cleanup_table; + unregister_pernet_subsys(&iptable_filter_net_ops); } return ret; - - cleanup_table: - unregister_pernet_subsys(&iptable_filter_net_ops); - return ret; } static void __exit iptable_filter_fini(void) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index aef5d1fbe77..6a5079c34bb 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -44,6 +44,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) u_int8_t tos; __be32 saddr, daddr; u_int32_t mark; + int err; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -66,9 +67,11 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; @@ -76,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) /* The work comes in here from netfilter.c. 
*/ static unsigned int -iptable_mangle_hook(unsigned int hook, +iptable_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (hook == NF_INET_LOCAL_OUT) + if (ops->hooknum == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, out); - if (hook == NF_INET_POST_ROUTING) - return ipt_do_table(skb, hook, in, out, + if (ops->hooknum == NF_INET_POST_ROUTING) + return ipt_do_table(skb, ops->hooknum, in, out, dev_net(out)->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, hook, in, out, + return ipt_do_table(skb, ops->hooknum, in, out, dev_net(in)->ipv4.iptable_mangle); } @@ -104,9 +107,7 @@ static int __net_init iptable_mangle_net_init(struct net *net) net->ipv4.iptable_mangle = ipt_register_table(net, &packet_mangler, repl); kfree(repl); - if (IS_ERR(net->ipv4.iptable_mangle)) - return PTR_ERR(net->ipv4.iptable_mangle); - return 0; + return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); } static void __net_exit iptable_mangle_net_exit(struct net *net) @@ -131,14 +132,10 @@ static int __init iptable_mangle_init(void) mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { ret = PTR_ERR(mangle_ops); - goto cleanup_table; + unregister_pernet_subsys(&iptable_mangle_net_ops); } return ret; - - cleanup_table: - unregister_pernet_subsys(&iptable_mangle_net_ops); - return ret; } static void __exit iptable_mangle_fini(void) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c new file mode 100644 index 00000000000..f1787c04a4d --- /dev/null +++ b/net/ipv4/netfilter/iptable_nat.c @@ -0,0 +1,328 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/ip.h> +#include <net/ip.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> + +static const struct xt_table nf_nat_ipv4_table = { + .name = "nat", + .valid_hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + .af = NFPROTO_IPV4, +}; + +static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + */ + struct nf_nat_range range; + + range.flags = 0; + pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, + HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? 
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); + + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); +} + +static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + unsigned int ret; + + ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); + if (ret == NF_ACCEPT) { + if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) + ret = alloc_null_binding(ct, hooknum); + } + return ret; +} + +static unsigned int +nf_nat_ipv4_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + /* maniptype == SRC for postrouting. */ + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + * and local-out, and nf_nat_out protects post-routing. + */ + NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + ops->hooknum)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} + +static unsigned int +nf_nat_ipv4_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + __be32 daddr = ip_hdr(skb)->daddr; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + + return ret; +} + +static unsigned int +nf_nat_ipv4_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} + +static unsigned int +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} + +static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv4_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv4_out, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC, + }, + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv4_local_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv4_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC, + }, +}; + +static int __net_init iptable_nat_net_init(struct net *net) +{ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv4.nat_table); +} + +static void __net_exit iptable_nat_net_exit(struct net *net) +{ + ipt_unregister_table(net, net->ipv4.nat_table); +} + +static struct pernet_operations iptable_nat_net_ops = { + .init = iptable_nat_net_init, + .exit = iptable_nat_net_exit, +}; + +static int __init iptable_nat_init(void) +{ + int err; + + err = register_pernet_subsys(&iptable_nat_net_ops); + if (err < 0) + goto err1; + + err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + if (err < 0) + goto err2; + return 0; + +err2: + 
unregister_pernet_subsys(&iptable_nat_net_ops); +err1: + return err; +} + +static void __exit iptable_nat_exit(void) +{ + nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + unregister_pernet_subsys(&iptable_nat_net_ops); +} + +module_init(iptable_nat_init); +module_exit(iptable_nat_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 07fb710cd72..b2f7e8f9831 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,20 +20,20 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(unsigned int hook, struct sk_buff *skb, +iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (hook == NF_INET_LOCAL_OUT && + if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); + return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net) net->ipv4.iptable_raw = ipt_register_table(net, &packet_raw, repl); kfree(repl); - if (IS_ERR(net->ipv4.iptable_raw)) - return PTR_ERR(net->ipv4.iptable_raw); - return 0; + return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); } static void __net_exit iptable_raw_net_exit(struct net *net) @@ -75,14 +73,10 @@ static int __init iptable_raw_init(void) rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); if (IS_ERR(rawtable_ops)) { ret = PTR_ERR(rawtable_ops); - goto cleanup_table; + unregister_pernet_subsys(&iptable_raw_net_ops); } return ret; - - cleanup_table: - unregister_pernet_subsys(&iptable_raw_net_ops); - return ret; } static void __exit iptable_raw_fini(void) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index be45bdc4c60..c86647ed207 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,21 +37,22 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(unsigned int hook, struct sk_buff *skb, +iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (hook == NF_INET_LOCAL_OUT && + if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? 
in : out); - return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); + return ipt_do_table(skb, ops->hooknum, in, out, + net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -66,10 +67,7 @@ static int __net_init iptable_security_net_init(struct net *net) net->ipv4.iptable_security = ipt_register_table(net, &security_table, repl); kfree(repl); - if (IS_ERR(net->ipv4.iptable_security)) - return PTR_ERR(net->ipv4.iptable_security); - - return 0; + return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); } static void __net_exit iptable_security_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 91747d4ebc2..8127dc80286 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -1,6 +1,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -24,16 +25,12 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #include <net/netfilter/nf_log.h> -int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo); -EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); - static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple) { @@ -95,47 +92,52 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_confirm(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ipv4_helper(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; - unsigned int ret; /* This is where we call the helper: as the packet goes out. 
*/ ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; + return NF_ACCEPT; help = nfct_help(ct); if (!help) - goto out; + return NF_ACCEPT; /* rcu_read_lock()ed by nf_hook_slow */ helper = rcu_dereference(help->helper); if (!helper) - goto out; + return NF_ACCEPT; - ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); - if (ret != NF_ACCEPT) { - nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, - "nf_ct_%s: dropping packet", helper->name); - return ret; - } + return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); +} + +static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } @@ -145,16 +147,16 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(unsigned int hooknum, +static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb); } -static unsigned int ipv4_conntrack_local(unsigned int hooknum, +static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -164,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb); } /* Connection tracking may drop packets, but never alters them, so @@ -185,6 +187,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .priority = NF_IP_PRI_CONNTRACK, }, { + .hook = ipv4_helper, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, @@ -192,6 +201,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { + .hook = ipv4_helper, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, @@ -204,38 +220,33 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -static ctl_table ip_ct_sysctl_table[] = { +static struct ctl_table ip_ct_sysctl_table[] = { { .procname = "ip_conntrack_max", - .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_count", - .data = 
&init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_buckets", - .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_checksum", - .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_log_invalid", - .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -351,6 +362,25 @@ static struct nf_sockopt_ops so_getorigdst = { .owner = THIS_MODULE, }; +static int ipv4_init_net(struct net *net) +{ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + struct nf_ip_net *in = &net->ct.nf_ct_proto; + in->ctl_table = kmemdup(ip_ct_sysctl_table, + sizeof(ip_ct_sysctl_table), + GFP_KERNEL); + if (!in->ctl_table) + return -ENOMEM; + + in->ctl_table[0].data = &nf_conntrack_max; + in->ctl_table[1].data = &net->ct.count; + in->ctl_table[2].data = &net->ct.htable_size; + in->ctl_table[3].data = &net->ct.sysctl_checksum; + in->ctl_table[4].data = &net->ct.sysctl_log_invalid; +#endif + return 0; +} + struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .l3proto = PF_INET, .name = "ipv4", @@ -366,8 +396,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) .ctl_table_path = "net/ipv4/netfilter", - .ctl_table = ip_ct_sysctl_table, #endif + .init_net = ipv4_init_net, .me = THIS_MODULE, }; @@ -378,6 +408,54 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); +static int ipv4_net_init(struct net *net) +{ + int ret = 0; + + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4); + if (ret < 0) { + pr_err("nf_conntrack_tcp4: pernet registration failed\n"); + goto out_tcp; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4); + if (ret < 0) { + pr_err("nf_conntrack_udp4: pernet registration failed\n"); + goto out_udp; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp); + if (ret < 0) { + pr_err("nf_conntrack_icmp4: pernet registration failed\n"); + goto out_icmp; + } + ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: pernet registration failed\n"); + goto out_ipv4; + } + return 0; +out_ipv4: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); +out_icmp: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); +out_udp: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); +out_tcp: + return ret; +} + +static void ipv4_net_exit(struct net *net) +{ + nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); +} + +static struct pernet_operations ipv4_net_ops = { + .init = ipv4_net_init, + .exit = ipv4_net_exit, +}; + static int __init nf_conntrack_l3proto_ipv4_init(void) { int ret = 0; @@ -391,54 +469,63 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) return ret; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); + ret = register_pernet_subsys(&ipv4_net_ops); if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register 
tcp.\n"); + pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); goto cleanup_sockopt; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); + ret = nf_register_hooks(ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register udp.\n"); - goto cleanup_tcp; + pr_err("nf_conntrack_ipv4: can't register hooks.\n"); + goto cleanup_pernet; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4); if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register icmp.\n"); - goto cleanup_udp; + pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n"); + goto cleanup_hooks; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4); if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4\n"); - goto cleanup_icmp; + pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n"); + goto cleanup_tcp4; } - ret = nf_register_hooks(ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp); if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register hooks.\n"); - goto cleanup_ipv4; + pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n"); + goto cleanup_udp4; } + + ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); + goto cleanup_icmpv4; + } + #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) ret = nf_conntrack_ipv4_compat_init(); if (ret < 0) - goto cleanup_hooks; + goto cleanup_proto; #endif return ret; #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + cleanup_proto: + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); +#endif + cleanup_icmpv4: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); + cleanup_udp4: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); + cleanup_tcp4: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); cleanup_hooks: nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); -#endif - cleanup_ipv4: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - cleanup_icmp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + cleanup_pernet: + unregister_pernet_subsys(&ipv4_net_ops); cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst); return ret; @@ -450,19 +537,14 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) nf_conntrack_ipv4_compat_fini(); #endif + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + unregister_pernet_subsys(&ipv4_net_ops); nf_unregister_sockopt(&so_getorigdst); } module_init(nf_conntrack_l3proto_ipv4_init); module_exit(nf_conntrack_l3proto_ipv4_fini); - -void need_ipv4_conntrack(void) -{ - 
return; -} -EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 9682b36df38..4c48e434bb1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -2,6 +2,7 @@ * * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -417,12 +418,12 @@ static int __net_init ip_conntrack_net_init(struct net *net) { struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); + proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, - &ip_exp_file_ops); + proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net, + &ip_exp_file_ops); if (!proc_exp) goto err2; @@ -433,9 +434,9 @@ static int __net_init ip_conntrack_net_init(struct net *net) return 0; err3: - proc_net_remove(net, "ip_conntrack_expect"); + remove_proc_entry("ip_conntrack_expect", net->proc_net); err2: - proc_net_remove(net, "ip_conntrack"); + remove_proc_entry("ip_conntrack", net->proc_net); err1: return -ENOMEM; } @@ -443,8 +444,8 @@ err1: static void __net_exit ip_conntrack_net_exit(struct net *net) { remove_proc_entry("ip_conntrack", net->proc_net_stat); - proc_net_remove(net, "ip_conntrack_expect"); - proc_net_remove(net, "ip_conntrack"); + remove_proc_entry("ip_conntrack_expect", net->proc_net); + remove_proc_entry("ip_conntrack", net->proc_net); } static struct pernet_operations ip_conntrack_net_ops = { diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 0847e373d33..a338dad41b7 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,6 +24,11 @@ static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; +static inline struct nf_icmp_net *icmp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp; +} + static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { @@ -77,7 +83,7 @@ static int icmp_print_tuple(struct seq_file *s, static unsigned int *icmp_get_timeouts(struct net *net) { - return &nf_ct_icmp_timeout; + return &icmp_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. 
*/ @@ -182,8 +188,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl, icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: short packet "); + nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, + NULL, "nf_ct_icmp: short packet "); return -NF_ACCEPT; } @@ -191,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; } @@ -204,7 +210,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, */ if (icmph->type > NR_ICMP_TYPES) { if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; } @@ -274,16 +280,18 @@ static int icmp_nlattr_tuple_size(void) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> -static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) { unsigned int *timeout = data; + struct nf_icmp_net *in = icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; } else { /* Set default ICMP timeout. */ - *timeout = nf_ct_icmp_timeout; + *timeout = in->timeout; } return 0; } @@ -308,11 +316,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ #ifdef CONFIG_SYSCTL -static struct ctl_table_header *icmp_sysctl_header; static struct ctl_table icmp_sysctl_table[] = { { .procname = "nf_conntrack_icmp_timeout", - .data = &nf_ct_icmp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -323,7 +329,6 @@ static struct ctl_table icmp_sysctl_table[] = { static struct ctl_table icmp_compat_sysctl_table[] = { { .procname = "ip_conntrack_icmp_timeout", - .data = &nf_ct_icmp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -333,6 +338,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = { #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ +static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmp_sysctl_table, + sizeof(icmp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, + sizeof(icmp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &in->timeout; +#endif +#endif + return 0; +} + +static int icmp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_icmp_net *in = icmp_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmp_timeout; + + ret = icmp_kmemdup_compat_sysctl_table(pn, in); + if (ret < 0) + return ret; + + ret = icmp_kmemdup_sysctl_table(pn, in); + if (ret < 
0) + nf_ct_kfree_compat_sysctl_table(pn); + + return ret; +} + +static struct nf_proto_net *icmp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = { .l3proto = PF_INET, @@ -362,11 +423,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ -#ifdef CONFIG_SYSCTL - .ctl_table_header = &icmp_sysctl_header, - .ctl_table = icmp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = icmp_compat_sysctl_table, -#endif -#endif + .init_net = icmp_init_net, + .get_net_proto = icmp_get_net_proto, }; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9bb1b8a37a2..b8f6381c7d0 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) err = ip_defrag(skb, user); local_bh_enable(); - if (!err) + if (!err) { ip_send_check(ip_hdr(skb)); + skb->ignore_df = 1; + } return err; } @@ -60,7 +61,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, return IP_DEFRAG_CONNTRACK_OUT + zone; } -static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, +static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -83,7 +84,9 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { - enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); + enum ip_defrag_users user = + nf_ct_defrag_user(ops->hooknum, skb); + if (nf_ct_ipv4_gather_frags(skb, user)) return NF_STOLEN; } @@ -94,14 +97,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c deleted file mode 100644 index 7b22382ff0e..00000000000 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ /dev/null @@ -1,85 +0,0 @@ -/* Amanda extension for TCP NAT alteration. - * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> - * based on a copy of HW's ip_nat_irc.c as well as other modules - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/udp.h> - -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_amanda.h> - -MODULE_AUTHOR("Brian J. 
Murrell <netfilter@interlinx.bc.ca>"); -MODULE_DESCRIPTION("Amanda NAT helper"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_amanda"); - -static unsigned int help(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) -{ - char buffer[sizeof("65535")]; - u_int16_t port; - unsigned int ret; - - /* Connection comes from client. */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = IP_CT_DIR_ORIGINAL; - - /* When you see the packet, we need to NAT it the same as the - * this one (ie. same IP: it will be TCP and master is UDP). */ - exp->expectfn = nf_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int res; - - exp->tuple.dst.u.tcp.port = htons(port); - res = nf_ct_expect_related(exp); - if (res == 0) - break; - else if (res != -EBUSY) { - port = 0; - break; - } - } - - if (port == 0) - return NF_DROP; - - sprintf(buffer, "%u", port); - ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, - matchoff, matchlen, - buffer, strlen(buffer)); - if (ret != NF_ACCEPT) - nf_ct_unexpect_related(exp); - return ret; -} - -static void __exit nf_nat_amanda_fini(void) -{ - RCU_INIT_POINTER(nf_nat_amanda_hook, NULL); - synchronize_rcu(); -} - -static int __init nf_nat_amanda_init(void) -{ - BUG_ON(nf_nat_amanda_hook != NULL); - RCU_INIT_POINTER(nf_nat_amanda_hook, help); - return 0; -} - -module_init(nf_nat_amanda_init); -module_exit(nf_nat_amanda_fini); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c deleted file mode 100644 index abb52adf5ac..00000000000 --- a/net/ipv4/netfilter/nf_nat_core.c +++ /dev/null @@ -1,757 +0,0 @@ -/* NAT for netfilter; shared with compatibility layer. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/gfp.h> -#include <net/checksum.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> /* For tcp_prot in getorigdst */ -#include <linux/icmp.h> -#include <linux/udp.h> -#include <linux/jhash.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_zones.h> - -static DEFINE_SPINLOCK(nf_nat_lock); - -static struct nf_conntrack_l3proto *l3proto __read_mostly; - -#define MAX_IP_NAT_PROTO 256 -static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] - __read_mostly; - -static inline const struct nf_nat_protocol * -__nf_nat_proto_find(u_int8_t protonum) -{ - return rcu_dereference(nf_nat_protos[protonum]); -} - -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) -{ - unsigned int hash; - - /* Original src, to ensure we map it consistently if poss. 
*/ - hash = jhash_3words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->src.u.all ^ zone, - tuple->dst.protonum, nf_conntrack_hash_rnd); - return ((u64)hash * net->ipv4.nat_htable_size) >> 32; -} - -/* Is this tuple already taken? (not by us) */ -int -nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack) -{ - /* Conntrack tracking doesn't keep track of outgoing tuples; only - incoming ones. NAT means they don't have a fixed mapping, - so we invert the tuple and look for the incoming reply. - - We could keep a separate hash if this proves too slow. */ - struct nf_conntrack_tuple reply; - - nf_ct_invert_tuplepr(&reply, tuple); - return nf_conntrack_tuple_taken(&reply, ignored_conntrack); -} -EXPORT_SYMBOL(nf_nat_used_tuple); - -/* If we source map this tuple so reply looks like reply_tuple, will - * that meet the constraints of range. */ -static int -in_range(const struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range) -{ - const struct nf_nat_protocol *proto; - int ret = 0; - - /* If we are supposed to map IPs, then we must be in the - range specified, otherwise let this drag us onto a new src IP. */ - if (range->flags & NF_NAT_RANGE_MAP_IPS) { - if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) || - ntohl(tuple->src.u3.ip) > ntohl(range->max_ip)) - return 0; - } - - rcu_read_lock(); - proto = __nf_nat_proto_find(tuple->dst.protonum); - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) || - proto->in_range(tuple, NF_NAT_MANIP_SRC, - &range->min, &range->max)) - ret = 1; - rcu_read_unlock(); - - return ret; -} - -static inline int -same_src(const struct nf_conn *ct, - const struct nf_conntrack_tuple *tuple) -{ - const struct nf_conntrack_tuple *t; - - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - return (t->dst.protonum == tuple->dst.protonum && - t->src.u3.ip == tuple->src.u3.ip && - t->src.u.all == tuple->src.u.all); -} - -/* Only called for SRC manip */ -static int -find_appropriate_src(struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_tuple *result, - const struct nf_nat_ipv4_range *range) -{ - unsigned int h = hash_by_src(net, zone, tuple); - const struct nf_conn_nat *nat; - const struct nf_conn *ct; - const struct hlist_node *n; - - rcu_read_lock(); - hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { - ct = nat->ct; - if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { - /* Copy source part from reply tuple. */ - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(result, range)) { - rcu_read_unlock(); - return 1; - } - } - } - rcu_read_unlock(); - return 0; -} - -/* For [FUTURE] fragmentation handling, we want the least-used - src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus - if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports - 1-65535, we don't do pro-rata allocation based on ports; we choose - the ip with the lowest src-ip/dst-ip/proto usage. -*/ -static void -find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - const struct nf_conn *ct, - enum nf_nat_manip_type maniptype) -{ - __be32 *var_ipp; - /* Host order */ - u_int32_t minip, maxip, j; - - /* No IP mapping? Do nothing. */ - if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) - return; - - if (maniptype == NF_NAT_MANIP_SRC) - var_ipp = &tuple->src.u3.ip; - else - var_ipp = &tuple->dst.u3.ip; - - /* Fast path: only one choice. 
*/ - if (range->min_ip == range->max_ip) { - *var_ipp = range->min_ip; - return; - } - - /* Hashing source and destination IPs gives a fairly even - * spread in practice (if there are a small number of IPs - * involved, there usually aren't that many connections - * anyway). The consistency means that servers see the same - * client coming from the same IP (some Internet Banking sites - * like this), even across reboots. */ - minip = ntohl(range->min_ip); - maxip = ntohl(range->max_ip); - j = jhash_2words((__force u32)tuple->src.u3.ip, - range->flags & NF_NAT_RANGE_PERSISTENT ? - 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0); - j = ((u64)j * (maxip - minip + 1)) >> 32; - *var_ipp = htonl(minip + j); -} - -/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, - * we change the source to map into the range. For NF_INET_PRE_ROUTING - * and NF_INET_LOCAL_OUT, we change the destination to map into the - * range. It might not be possible to get a unique tuple, but we try. - * At worst (or if we race), we will end up with a final duplicate in - * __ip_conntrack_confirm and drop the packet. */ -static void -get_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig_tuple, - const struct nf_nat_ipv4_range *range, - struct nf_conn *ct, - enum nf_nat_manip_type maniptype) -{ - struct net *net = nf_ct_net(ct); - const struct nf_nat_protocol *proto; - u16 zone = nf_ct_zone(ct); - - /* 1) If this srcip/proto/src-proto-part is currently mapped, - and that same mapping gives a unique tuple within the given - range, use that. - - This is only required for source (ie. NAT/masq) mappings. - So far, we don't do local source mappings, so multiple - manips not an issue. */ - if (maniptype == NF_NAT_MANIP_SRC && - !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) { - /* try the original tuple first */ - if (in_range(orig_tuple, range)) { - if (!nf_nat_used_tuple(orig_tuple, ct)) { - *tuple = *orig_tuple; - return; - } - } else if (find_appropriate_src(net, zone, orig_tuple, tuple, - range)) { - pr_debug("get_unique_tuple: Found current src map\n"); - if (!nf_nat_used_tuple(tuple, ct)) - return; - } - } - - /* 2) Select the least-used IP/proto combination in the given - range. */ - *tuple = *orig_tuple; - find_best_ips_proto(zone, tuple, range, ct, maniptype); - - /* 3) The per-protocol part of the manip is made to map into - the range to make a unique tuple. */ - - rcu_read_lock(); - proto = __nf_nat_proto_find(orig_tuple->dst.protonum); - - /* Only bother mapping if it's not already in range and unique */ - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) { - if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { - if (proto->in_range(tuple, maniptype, &range->min, - &range->max) && - (range->min.all == range->max.all || - !nf_nat_used_tuple(tuple, ct))) - goto out; - } else if (!nf_nat_used_tuple(tuple, ct)) { - goto out; - } - } - - /* Last change: get protocol to try to obtain unique tuple. 
*/ - proto->unique_tuple(tuple, range, maniptype, ct); -out: - rcu_read_unlock(); -} - -unsigned int -nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype) -{ - struct net *net = nf_ct_net(ct); - struct nf_conntrack_tuple curr_tuple, new_tuple; - struct nf_conn_nat *nat; - - /* nat helper or nfctnetlink also setup binding */ - nat = nfct_nat(ct); - if (!nat) { - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } - - NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || - maniptype == NF_NAT_MANIP_DST); - BUG_ON(nf_nat_initialized(ct, maniptype)); - - /* What we've got will look like inverse of reply. Normally - this is what is in the conntrack, except for prior - manipulations (future optimization: if num_manips == 0, - orig_tp = - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ - nf_ct_invert_tuplepr(&curr_tuple, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); - - if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { - struct nf_conntrack_tuple reply; - - /* Alter conntrack table so will recognize replies. */ - nf_ct_invert_tuplepr(&reply, &new_tuple); - nf_conntrack_alter_reply(ct, &reply); - - /* Non-atomic: we own this at the moment. */ - if (maniptype == NF_NAT_MANIP_SRC) - ct->status |= IPS_SRC_NAT; - else - ct->status |= IPS_DST_NAT; - } - - if (maniptype == NF_NAT_MANIP_SRC) { - unsigned int srchash; - - srchash = hash_by_src(net, nf_ct_zone(ct), - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); - /* nf_conntrack_alter_reply might re-allocate extension area */ - nat = nfct_nat(ct); - nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, - &net->ipv4.nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); - } - - /* It's done. */ - if (maniptype == NF_NAT_MANIP_DST) - ct->status |= IPS_DST_NAT_DONE; - else - ct->status |= IPS_SRC_NAT_DONE; - - return NF_ACCEPT; -} -EXPORT_SYMBOL(nf_nat_setup_info); - -/* Returns true if succeeded. */ -static bool -manip_pkt(u_int16_t proto, - struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *target, - enum nf_nat_manip_type maniptype) -{ - struct iphdr *iph; - const struct nf_nat_protocol *p; - - if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) - return false; - - iph = (void *)skb->data + iphdroff; - - /* Manipulate protcol part. */ - - /* rcu_read_lock()ed by nf_hook_slow */ - p = __nf_nat_proto_find(proto); - if (!p->manip_pkt(skb, iphdroff, target, maniptype)) - return false; - - iph = (void *)skb->data + iphdroff; - - if (maniptype == NF_NAT_MANIP_SRC) { - csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); - iph->saddr = target->src.u3.ip; - } else { - csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); - iph->daddr = target->dst.u3.ip; - } - return true; -} - -/* Do packet manipulations according to nf_nat_setup_info. */ -unsigned int nf_nat_packet(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, - struct sk_buff *skb) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned long statusbit; - enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); - - if (mtype == NF_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - /* Non-atomic: these bits don't change. 
*/ - if (ct->status & statusbit) { - struct nf_conntrack_tuple target; - - /* We are aiming to look like inverse of other direction. */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - - if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype)) - return NF_DROP; - } - return NF_ACCEPT; -} -EXPORT_SYMBOL_GPL(nf_nat_packet); - -/* Dir is direction ICMP is coming from (opposite to packet it contains) */ -int nf_nat_icmp_reply_translation(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, - struct sk_buff *skb) -{ - struct { - struct icmphdr icmp; - struct iphdr ip; - } *inside; - struct nf_conntrack_tuple target; - int hdrlen = ip_hdrlen(skb); - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned long statusbit; - enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); - - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) - return 0; - - inside = (void *)skb->data + hdrlen; - - /* We're actually going to mangle it beyond trivial checksum - adjustment, so make sure the current checksum is correct. */ - if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) - return 0; - - /* Must be RELATED */ - NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || - skb->nfctinfo == IP_CT_RELATED_REPLY); - - /* Redirects on non-null nats must be dropped, else they'll - start talking to each other without our translation, and be - confused... --RR */ - if (inside->icmp.type == ICMP_REDIRECT) { - /* If NAT isn't finished, assume it and drop. */ - if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) - return 0; - - if (ct->status & IPS_NAT_MASK) - return 0; - } - - if (manip == NF_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (!(ct->status & statusbit)) - return 1; - - pr_debug("icmp_reply_translation: translating error %p manip %u " - "dir %s\n", skb, manip, - dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); - - /* Change inner back to look like incoming packet. We do the - opposite manip on this hook to normal, because it might not - pass all hooks (locally-generated ICMP). Consider incoming - packet: PREROUTING (DST manip), routing produces ICMP, goes - through POSTROUTING (which must correct the DST manip). */ - if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp), - &ct->tuplehash[!dir].tuple, !manip)) - return 0; - - if (skb->ip_summed != CHECKSUM_PARTIAL) { - /* Reloading "inside" here since manip_pkt inner. */ - inside = (void *)skb->data + hdrlen; - inside->icmp.checksum = 0; - inside->icmp.checksum = - csum_fold(skb_checksum(skb, hdrlen, - skb->len - hdrlen, 0)); - } - - /* Change outer to look the reply to an incoming packet - * (proto 0 means don't invert per-proto part). */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, skb, 0, &target, manip)) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); - -/* Protocol registration. */ -int nf_nat_protocol_register(const struct nf_nat_protocol *proto) -{ - int ret = 0; - - spin_lock_bh(&nf_nat_lock); - if (rcu_dereference_protected( - nf_nat_protos[proto->protonum], - lockdep_is_held(&nf_nat_lock) - ) != &nf_nat_unknown_protocol) { - ret = -EBUSY; - goto out; - } - RCU_INIT_POINTER(nf_nat_protos[proto->protonum], proto); - out: - spin_unlock_bh(&nf_nat_lock); - return ret; -} -EXPORT_SYMBOL(nf_nat_protocol_register); - -/* No one stores the protocol anywhere; simply delete it. 
*/ -void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) -{ - spin_lock_bh(&nf_nat_lock); - RCU_INIT_POINTER(nf_nat_protos[proto->protonum], - &nf_nat_unknown_protocol); - spin_unlock_bh(&nf_nat_lock); - synchronize_rcu(); -} -EXPORT_SYMBOL(nf_nat_protocol_unregister); - -/* No one using conntrack by the time this called. */ -static void nf_nat_cleanup_conntrack(struct nf_conn *ct) -{ - struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - - if (nat == NULL || nat->ct == NULL) - return; - - NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); - spin_unlock_bh(&nf_nat_lock); -} - -static void nf_nat_move_storage(void *new, void *old) -{ - struct nf_conn_nat *new_nat = new; - struct nf_conn_nat *old_nat = old; - struct nf_conn *ct = old_nat->ct; - - if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) - return; - - spin_lock_bh(&nf_nat_lock); - hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); - spin_unlock_bh(&nf_nat_lock); -} - -static struct nf_ct_ext_type nat_extend __read_mostly = { - .len = sizeof(struct nf_conn_nat), - .align = __alignof__(struct nf_conn_nat), - .destroy = nf_nat_cleanup_conntrack, - .move = nf_nat_move_storage, - .id = NF_CT_EXT_NAT, - .flags = NF_CT_EXT_F_PREALLOC, -}; - -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { - [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, - [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, -}; - -static int nfnetlink_parse_nat_proto(struct nlattr *attr, - const struct nf_conn *ct, - struct nf_nat_ipv4_range *range) -{ - struct nlattr *tb[CTA_PROTONAT_MAX+1]; - const struct nf_nat_protocol *npt; - int err; - - err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); - if (err < 0) - return err; - - rcu_read_lock(); - npt = __nf_nat_proto_find(nf_ct_protonum(ct)); - if (npt->nlattr_to_range) - err = npt->nlattr_to_range(tb, range); - rcu_read_unlock(); - return err; -} - -static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { - [CTA_NAT_MINIP] = { .type = NLA_U32 }, - [CTA_NAT_MAXIP] = { .type = NLA_U32 }, - [CTA_NAT_PROTO] = { .type = NLA_NESTED }, -}; - -static int -nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_ipv4_range *range) -{ - struct nlattr *tb[CTA_NAT_MAX+1]; - int err; - - memset(range, 0, sizeof(*range)); - - err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); - if (err < 0) - return err; - - if (tb[CTA_NAT_MINIP]) - range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); - - if (!tb[CTA_NAT_MAXIP]) - range->max_ip = range->min_ip; - else - range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); - - if (range->min_ip) - range->flags |= NF_NAT_RANGE_MAP_IPS; - - if (!tb[CTA_NAT_PROTO]) - return 0; - - err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); - if (err < 0) - return err; - - return 0; -} - -static int -nfnetlink_parse_nat_setup(struct nf_conn *ct, - enum nf_nat_manip_type manip, - const struct nlattr *attr) -{ - struct nf_nat_ipv4_range range; - - if (nfnetlink_parse_nat(attr, ct, &range) < 0) - return -EINVAL; - if (nf_nat_initialized(ct, manip)) - return -EEXIST; - - return nf_nat_setup_info(ct, &range, manip); -} -#else -static int -nfnetlink_parse_nat_setup(struct nf_conn *ct, - enum nf_nat_manip_type manip, - const struct nlattr *attr) -{ - return -EOPNOTSUPP; -} 
-#endif - -static int __net_init nf_nat_net_init(struct net *net) -{ - /* Leave them the same for the moment. */ - net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); - if (!net->ipv4.nat_bysource) - return -ENOMEM; - return 0; -} - -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int clean_nat(struct nf_conn *i, void *data) -{ - struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - memset(nat, 0, sizeof(*nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); - return 0; -} - -static void __net_exit nf_nat_net_exit(struct net *net) -{ - nf_ct_iterate_cleanup(net, &clean_nat, NULL); - synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); -} - -static struct pernet_operations nf_nat_net_ops = { - .init = nf_nat_net_init, - .exit = nf_nat_net_exit, -}; - -static struct nf_ct_helper_expectfn follow_master_nat = { - .name = "nat-follow-master", - .expectfn = nf_nat_follow_master, -}; - -static int __init nf_nat_init(void) -{ - size_t i; - int ret; - - need_ipv4_conntrack(); - - ret = nf_ct_extend_register(&nat_extend); - if (ret < 0) { - printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); - return ret; - } - - ret = register_pernet_subsys(&nf_nat_net_ops); - if (ret < 0) - goto cleanup_extend; - - /* Sew in builtin protocols. */ - spin_lock_bh(&nf_nat_lock); - for (i = 0; i < MAX_IP_NAT_PROTO; i++) - RCU_INIT_POINTER(nf_nat_protos[i], &nf_nat_unknown_protocol); - RCU_INIT_POINTER(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); - RCU_INIT_POINTER(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); - RCU_INIT_POINTER(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); - spin_unlock_bh(&nf_nat_lock); - - /* Initialize fake conntrack so that NAT will skip it */ - nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - - l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); - - nf_ct_helper_expectfn_register(&follow_master_nat); - - BUG_ON(nf_nat_seq_adjust_hook != NULL); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); - BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); - RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, - nfnetlink_parse_nat_setup); - BUG_ON(nf_ct_nat_offset != NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); - return 0; - - cleanup_extend: - nf_ct_extend_unregister(&nat_extend); - return ret; -} - -static void __exit nf_nat_cleanup(void) -{ - unregister_pernet_subsys(&nf_nat_net_ops); - nf_ct_l3proto_put(l3proto); - nf_ct_extend_unregister(&nat_extend); - nf_ct_helper_expectfn_unregister(&follow_master_nat); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); - RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); - synchronize_net(); -} - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("nf-nat-ipv4"); - -module_init(nf_nat_init); -module_exit(nf_nat_cleanup); diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c deleted file mode 100644 index e462a957d08..00000000000 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ /dev/null @@ -1,137 +0,0 @@ -/* FTP extension for TCP NAT alteration. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_ftp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("ftp NAT helper"); -MODULE_ALIAS("ip_nat_ftp"); - -/* FIXME: Time out? --RR */ - -static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type, - char *buffer, size_t buflen, - __be32 addr, u16 port) -{ - switch (type) { - case NF_CT_FTP_PORT: - case NF_CT_FTP_PASV: - return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", - ((unsigned char *)&addr)[0], - ((unsigned char *)&addr)[1], - ((unsigned char *)&addr)[2], - ((unsigned char *)&addr)[3], - port >> 8, - port & 0xFF); - case NF_CT_FTP_EPRT: - return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port); - case NF_CT_FTP_EPSV: - return snprintf(buffer, buflen, "|||%u|", port); - } - - return 0; -} - -/* So, this packet has hit the connection tracking matching code. - Mangle it, and change the expectation to match the new version. */ -static unsigned int nf_nat_ftp(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - enum nf_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) -{ - __be32 newip; - u_int16_t port; - int dir = CTINFO2DIR(ctinfo); - struct nf_conn *ct = exp->master; - char buffer[sizeof("|1|255.255.255.255|65535|")]; - unsigned int buflen; - - pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); - - /* Connection will come from wherever this packet goes, hence !dir */ - newip = ct->tuplehash[!dir].tuple.dst.u3.ip; - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = !dir; - - /* When you see the packet, we need to NAT it the same as the - * this one. */ - exp->expectfn = nf_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - - if (port == 0) - return NF_DROP; - - buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port); - if (!buflen) - goto out; - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); - - if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, buflen)) - goto out; - - return NF_ACCEPT; - -out: - nf_ct_unexpect_related(exp); - return NF_DROP; -} - -static void __exit nf_nat_ftp_fini(void) -{ - RCU_INIT_POINTER(nf_nat_ftp_hook, NULL); - synchronize_rcu(); -} - -static int __init nf_nat_ftp_init(void) -{ - BUG_ON(nf_nat_ftp_hook != NULL); - RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp); - return 0; -} - -/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. 
*/ -static int warn_set(const char *val, struct kernel_param *kp) -{ - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); - return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(nf_nat_ftp_init); -module_exit(nf_nat_ftp_fini); diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index cad29c12131..574f7ebba0b 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -2,6 +2,7 @@ * H.323 extension for NAT alteration. * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * This source code is licensed under General Public License version 2. * @@ -15,13 +16,12 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> #include <linux/netfilter/nf_conntrack_h323.h> /****************************************************************************/ -static int set_addr(struct sk_buff *skb, +static int set_addr(struct sk_buff *skb, unsigned int protoff, unsigned char **data, int dataoff, unsigned int addroff, __be32 ip, __be16 port) { @@ -40,7 +40,7 @@ static int set_addr(struct sk_buff *skb, if (ip_hdr(skb)->protocol == IPPROTO_TCP) { if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, - addroff, sizeof(buf), + protoff, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); return -1; @@ -54,7 +54,7 @@ static int set_addr(struct sk_buff *skb, *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; } else { if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, - addroff, sizeof(buf), + protoff, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; @@ -69,22 +69,22 @@ static int set_addr(struct sk_buff *skb, } /****************************************************************************/ -static int set_h225_addr(struct sk_buff *skb, +static int set_h225_addr(struct sk_buff *skb, unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, union nf_inet_addr *addr, __be16 port) { - return set_addr(skb, data, dataoff, taddr->ipAddress.ip, + return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip, addr->ip, port); } /****************************************************************************/ -static int set_h245_addr(struct sk_buff *skb, +static int set_h245_addr(struct sk_buff *skb, unsigned protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, union nf_inet_addr *addr, __be16 port) { - return set_addr(skb, data, dataoff, + return set_addr(skb, protoff, data, dataoff, taddr->unicastAddress.iPAddress.network, addr->ip, port); } @@ -92,10 +92,10 @@ static int set_h245_addr(struct sk_buff *skb, /****************************************************************************/ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) { - const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + const struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int i; __be16 port; @@ -118,7 +118,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, &addr.ip, 
port, &ct->tuplehash[!dir].tuple.dst.u3.ip, info->sig_port[!dir]); - return set_h225_addr(skb, data, 0, &taddr[i], + return set_h225_addr(skb, protoff, data, 0, + &taddr[i], &ct->tuplehash[!dir]. tuple.dst.u3, info->sig_port[!dir]); @@ -129,7 +130,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, &addr.ip, port, &ct->tuplehash[!dir].tuple.src.u3.ip, info->sig_port[!dir]); - return set_h225_addr(skb, data, 0, &taddr[i], + return set_h225_addr(skb, protoff, data, 0, + &taddr[i], &ct->tuplehash[!dir]. tuple.src.u3, info->sig_port[!dir]); @@ -143,7 +145,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) { int dir = CTINFO2DIR(ctinfo); @@ -159,7 +161,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, &addr.ip, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3.ip, ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); - return set_h225_addr(skb, data, 0, &taddr[i], + return set_h225_addr(skb, protoff, data, 0, &taddr[i], &ct->tuplehash[!dir].tuple.dst.u3, ct->tuplehash[!dir].tuple. dst.u.udp.port); @@ -172,13 +174,13 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, __be16 port, __be16 rtp_port, struct nf_conntrack_expect *rtp_exp, struct nf_conntrack_expect *rtcp_exp) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int i; u_int16_t nated_port; @@ -227,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); nated_port = 0; break; @@ -244,7 +249,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, } /* Modify signal */ - if (set_h245_addr(skb, data, dataoff, taddr, + if (set_h245_addr(skb, protoff, data, dataoff, taddr, &ct->tuplehash[!dir].tuple.dst.u3, htons((port & htons(1)) ? 
nated_port + 1 : nated_port)) == 0) { @@ -275,7 +280,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) { @@ -307,7 +312,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, } /* Modify signal */ - if (set_h245_addr(skb, data, dataoff, taddr, + if (set_h245_addr(skb, protoff, data, dataoff, taddr, &ct->tuplehash[!dir].tuple.dst.u3, htons(nated_port)) < 0) { nf_ct_unexpect_related(exp); @@ -326,11 +331,11 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); u_int16_t nated_port = ntohs(port); @@ -363,7 +368,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, } /* Modify signal */ - if (set_h225_addr(skb, data, dataoff, taddr, + if (set_h225_addr(skb, protoff, data, dataoff, taddr, &ct->tuplehash[!dir].tuple.dst.u3, htons(nated_port)) == 0) { /* Save ports */ @@ -390,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_ipv4_range range; + struct nf_nat_range range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); @@ -402,24 +407,26 @@ static void ip_nat_q931_expect(struct nf_conn *new, /* Change src to where master sends to */ range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; + range.min_addr = range.max_addr = + new->tuplehash[!this->dir].tuple.src.u3; nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. 
*/ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = this->saved_proto; - range.min_ip = range.max_ip = - new->master->tuplehash[!this->dir].tuple.src.u3.ip; + range.min_proto = range.max_proto = this->saved_proto; + range.min_addr = range.max_addr = + new->master->tuplehash[!this->dir].tuple.src.u3; nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); } /****************************************************************************/ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, TransportAddress *taddr, int idx, + unsigned int protoff, unsigned char **data, + TransportAddress *taddr, int idx, __be16 port, struct nf_conntrack_expect *exp) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); u_int16_t nated_port = ntohs(port); union nf_inet_addr addr; @@ -453,7 +460,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, } /* Modify signal */ - if (set_h225_addr(skb, data, 0, &taddr[idx], + if (set_h225_addr(skb, protoff, data, 0, &taddr[idx], &ct->tuplehash[!dir].tuple.dst.u3, htons(nated_port)) == 0) { /* Save ports */ @@ -464,7 +471,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, if (idx > 0 && get_h225_addr(ct, *data, &taddr[0], &addr, &port) && (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { - set_h225_addr(skb, data, 0, &taddr[0], + set_h225_addr(skb, protoff, data, 0, &taddr[0], &ct->tuplehash[!dir].tuple.dst.u3, info->sig_port[!dir]); } @@ -487,26 +494,28 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_callforwarding_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_ipv4_range range; + struct nf_nat_range range; /* This must be a fresh one. */ BUG_ON(new->status & IPS_NAT_DONE_MASK); /* Change src to where master sends to */ range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; + range.min_addr = range.max_addr = + new->tuplehash[!this->dir].tuple.src.u3; nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. 
*/ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = this->saved_proto; - range.min_ip = range.max_ip = this->saved_ip; + range.min_proto = range.max_proto = this->saved_proto; + range.min_addr = range.max_addr = this->saved_addr; nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); } /****************************************************************************/ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) @@ -515,7 +524,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, u_int16_t nated_port; /* Set expectations for NAT */ - exp->saved_ip = exp->tuple.dst.u3.ip; + exp->saved_addr = exp->tuple.dst.u3; exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; exp->expectfn = ip_nat_callforwarding_expect; @@ -541,7 +550,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, } /* Modify signal */ - if (!set_h225_addr(skb, data, dataoff, taddr, + if (!set_h225_addr(skb, protoff, data, dataoff, taddr, &ct->tuplehash[!dir].tuple.dst.u3, htons(nated_port)) == 0) { nf_ct_unexpect_related(exp); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c deleted file mode 100644 index af65958f630..00000000000 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ /dev/null @@ -1,445 +0,0 @@ -/* ip_nat_helper.c - generic support functions for NAT helpers - * - * (C) 2000-2002 Harald Welte <laforge@netfilter.org> - * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/gfp.h> -#include <linux/kmod.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_ecache.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_helper.h> - -#define DUMP_OFFSET(x) \ - pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \ - x->offset_before, x->offset_after, x->correction_pos); - -static DEFINE_SPINLOCK(nf_nat_seqofs_lock); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, - int sizediff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way = &nat->seq[dir]; - - pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", - seq, sizediff); - - pr_debug("adjust_tcp_sequence: Seq_offset before: "); - DUMP_OFFSET(this_way); - - spin_lock_bh(&nf_nat_seqofs_lock); - - /* SYN adjust. 
If it's uninitialized, or this is after last - * correction, record it: we don't handle more than one - * adjustment in the window, but do deal with common case of a - * retransmit */ - if (this_way->offset_before == this_way->offset_after || - before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; - } - spin_unlock_bh(&nf_nat_seqofs_lock); - - pr_debug("adjust_tcp_sequence: Seq_offset after: "); - DUMP_OFFSET(this_way); -} - -/* Get the offset value, for conntrack */ -s16 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way; - s16 offset; - - if (!nat) - return 0; - - this_way = &nat->seq[dir]; - spin_lock_bh(&nf_nat_seqofs_lock); - offset = after(seq, this_way->correction_pos) - ? this_way->offset_after : this_way->offset_before; - spin_unlock_bh(&nf_nat_seqofs_lock); - - return offset; -} -EXPORT_SYMBOL_GPL(nf_nat_get_offset); - -/* Frobs data inside this packet, which is linear. */ -static void mangle_contents(struct sk_buff *skb, - unsigned int dataoff, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - unsigned char *data; - - BUG_ON(skb_is_nonlinear(skb)); - data = skb_network_header(skb) + dataoff; - - /* move post-replacement */ - memmove(data + match_offset + rep_len, - data + match_offset + match_len, - skb->tail - (skb->network_header + dataoff + - match_offset + match_len)); - - /* insert data from buffer */ - memcpy(data + match_offset, rep_buffer, rep_len); - - /* update skb info */ - if (rep_len > match_len) { - pr_debug("nf_nat_mangle_packet: Extending packet by " - "%u from %u bytes\n", rep_len - match_len, skb->len); - skb_put(skb, rep_len - match_len); - } else { - pr_debug("nf_nat_mangle_packet: Shrinking packet from " - "%u from %u bytes\n", match_len - rep_len, skb->len); - __skb_trim(skb, skb->len + rep_len - match_len); - } - - /* fix IP hdr checksum information */ - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); -} - -/* Unusual, but possible case. 
*/ -static int enlarge_skb(struct sk_buff *skb, unsigned int extra) -{ - if (skb->len + extra > 65535) - return 0; - - if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) - return 0; - - return 1; -} - -void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - __be32 seq, s16 off) -{ - if (!off) - return; - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); -} -EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); - -static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, - int datalen, __sum16 *check, int oldlen) -{ - struct rtable *rt = skb_rtable(skb); - - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - iph->ihl * 4; - skb->csum_offset = (void *)check - data; - *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, iph->protocol, 0); - } else { - *check = 0; - *check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, iph->protocol, - csum_partial(data, datalen, - 0)); - if (iph->protocol == IPPROTO_UDP && !*check) - *check = CSUM_MANGLED_0; - } - } else - inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), 1); -} - -/* Generic function for mangling variable-length address changes inside - * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX - * command in FTP). - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * */ -int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len, bool adjust) -{ - struct iphdr *iph; - struct tcphdr *tcph; - int oldlen, datalen; - - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (rep_len > match_len && - rep_len - match_len > skb_tailroom(skb) && - !enlarge_skb(skb, rep_len - match_len)) - return 0; - - SKB_LINEAR_ASSERT(skb); - - iph = ip_hdr(skb); - tcph = (void *)iph + iph->ihl*4; - - oldlen = skb->len - iph->ihl*4; - mangle_contents(skb, iph->ihl*4 + tcph->doff*4, - match_offset, match_len, rep_buffer, rep_len); - - datalen = skb->len - iph->ihl*4; - nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen); - - if (adjust && rep_len != match_len) - nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, - (int)rep_len - (int)match_len); - - return 1; -} -EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); - -/* Generic function for mangling variable-length address changes inside - * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX - * command in the Amanda protocol) - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * XXX - This function could be merged with nf_nat_mangle_tcp_packet which - * should be fairly easy to do. 
- */ -int -nf_nat_mangle_udp_packet(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - struct iphdr *iph; - struct udphdr *udph; - int datalen, oldlen; - - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (rep_len > match_len && - rep_len - match_len > skb_tailroom(skb) && - !enlarge_skb(skb, rep_len - match_len)) - return 0; - - iph = ip_hdr(skb); - udph = (void *)iph + iph->ihl*4; - - oldlen = skb->len - iph->ihl*4; - mangle_contents(skb, iph->ihl*4 + sizeof(*udph), - match_offset, match_len, rep_buffer, rep_len); - - /* update the length of the UDP packet */ - datalen = skb->len - iph->ihl*4; - udph->len = htons(datalen); - - /* fix udp checksum if udp checksum was previously calculated */ - if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) - return 1; - - nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen); - - return 1; -} -EXPORT_SYMBOL(nf_nat_mangle_udp_packet); - -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int sackoff, - unsigned int sackend, - struct nf_nat_seq *natseq) -{ - while (sackoff < sackend) { - struct tcp_sack_block_wire *sack; - __be32 new_start_seq, new_end_seq; - - sack = (void *)skb->data + sackoff; - if (after(ntohl(sack->start_seq) - natseq->offset_before, - natseq->correction_pos)) - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_after); - else - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_before); - - if (after(ntohl(sack->end_seq) - natseq->offset_before, - natseq->correction_pos)) - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_after); - else - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_before); - - pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", - ntohl(sack->start_seq), new_start_seq, - ntohl(sack->end_seq), new_end_seq); - - inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); - sack->start_seq = new_start_seq; - sack->end_seq = new_end_seq; - sackoff += sizeof(*sack); - } -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -nf_nat_sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dir, optoff, optend; - struct nf_conn_nat *nat = nfct_nat(ct); - - optoff = ip_hdrlen(skb) + sizeof(struct tcphdr); - optend = ip_hdrlen(skb) + tcph->doff * 4; - - if (!skb_make_writable(skb, optend)) - return 0; - - dir = CTINFO2DIR(ctinfo); - - while (optoff < optend) { - /* Usually: option, length. */ - unsigned char *op = skb->data + optoff; - - switch (op[0]) { - case TCPOPT_EOL: - return 1; - case TCPOPT_NOP: - optoff++; - continue; - default: - /* no partial options */ - if (optoff + 1 == optend || - optoff + op[1] > optend || - op[1] < 2) - return 0; - if (op[0] == TCPOPT_SACK && - op[1] >= 2+TCPOLEN_SACK_PERBLOCK && - ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) - sack_adjust(skb, tcph, optoff+2, - optoff+op[1], &nat->seq[!dir]); - optoff += op[1]; - } - } - return 1; -} - -/* TCP sequence number adjustment. 
Returns 1 on success, 0 on failure */ -int -nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - struct tcphdr *tcph; - int dir; - __be32 newseq, newack; - s16 seqoff, ackoff; - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way, *other_way; - - dir = CTINFO2DIR(ctinfo); - - this_way = &nat->seq[dir]; - other_way = &nat->seq[!dir]; - - if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) - return 0; - - tcph = (void *)skb->data + ip_hdrlen(skb); - if (after(ntohl(tcph->seq), this_way->correction_pos)) - seqoff = this_way->offset_after; - else - seqoff = this_way->offset_before; - - if (after(ntohl(tcph->ack_seq) - other_way->offset_before, - other_way->correction_pos)) - ackoff = other_way->offset_after; - else - ackoff = other_way->offset_before; - - newseq = htonl(ntohl(tcph->seq) + seqoff); - newack = htonl(ntohl(tcph->ack_seq) - ackoff); - - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); - - pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", - ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), - ntohl(newack)); - - tcph->seq = newseq; - tcph->ack_seq = newack; - - return nf_nat_sack_adjust(skb, tcph, ct, ctinfo); -} - -/* Setup NAT on this expected conntrack so it follows master. */ -/* If we fail to get a free NAT slot, we'll get dropped on confirm */ -void nf_nat_follow_master(struct nf_conn *ct, - struct nf_conntrack_expect *exp) -{ - struct nf_nat_ipv4_range range; - - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); - - /* For DST manip, map port here to where it's expected. */ - range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = exp->saved_proto; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; - nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); -} -EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c deleted file mode 100644 index 979ae165f4e..00000000000 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ /dev/null @@ -1,99 +0,0 @@ -/* IRC extension for TCP NAT alteration. - * - * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> - * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation - * based on a copy of RR's ip_nat_ftp.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/tcp.h> -#include <linux/kernel.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_irc.h> - -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("IRC (DCC) NAT helper"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_irc"); - -static unsigned int help(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) -{ - char buffer[sizeof("4294967296 65635")]; - u_int32_t ip; - u_int16_t port; - unsigned int ret; - - /* Reply comes from server. */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = IP_CT_DIR_REPLY; - exp->expectfn = nf_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - - if (port == 0) - return NF_DROP; - - ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip); - sprintf(buffer, "%u %u", ip, port); - pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", - buffer, &ip, port); - - ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo, - matchoff, matchlen, buffer, - strlen(buffer)); - if (ret != NF_ACCEPT) - nf_ct_unexpect_related(exp); - return ret; -} - -static void __exit nf_nat_irc_fini(void) -{ - RCU_INIT_POINTER(nf_nat_irc_hook, NULL); - synchronize_rcu(); -} - -static int __init nf_nat_irc_init(void) -{ - BUG_ON(nf_nat_irc_hook != NULL); - RCU_INIT_POINTER(nf_nat_irc_hook, help); - return 0; -} - -/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); - return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(nf_nat_irc_init); -module_exit(nf_nat_irc_fini); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c new file mode 100644 index 00000000000..d8b2e14efdd --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -0,0 +1,281 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/secure_seq.h> +#include <net/checksum.h> +#include <net/route.h> +#include <net/ip.h> + +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv4; + +#ifdef CONFIG_XFRM +static void nf_nat_ipv4_decode_session(struct sk_buff *skb, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + unsigned long statusbit, + struct flowi *fl) +{ + const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; + struct flowi4 *fl4 = &fl->u.ip4; + + if (ct->status & statusbit) { + fl4->daddr = t->dst.u3.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl4->fl4_dport = t->dst.u.all; + } + + statusbit ^= IPS_NAT_MASK; + + if (ct->status & statusbit) { + fl4->saddr = t->src.u3.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl4->fl4_sport = t->src.u.all; + } +} +#endif /* CONFIG_XFRM */ + +static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, + const struct nf_nat_range *range) +{ + return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && + ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); +} + +static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t, + __be16 dport) +{ + return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport); +} + +static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *target, + enum nf_nat_manip_type maniptype) +{ + struct iphdr *iph; + unsigned int hdroff; + + if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) + return false; + + iph = (void *)skb->data + iphdroff; + hdroff = iphdroff + iph->ihl * 4; + + if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff, + target, maniptype)) + return false; + iph = (void *)skb->data + iphdroff; + + if (maniptype == NF_NAT_MANIP_SRC) { + csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); + iph->saddr = target->src.u3.ip; + } else { + csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); + iph->daddr = target->dst.u3.ip; + } + return true; +} + +static void nf_nat_ipv4_csum_update(struct sk_buff *skb, + unsigned int iphdroff, __sum16 *check, + const struct nf_conntrack_tuple *t, + enum nf_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + __be32 oldip, newip; + + if (maniptype == NF_NAT_MANIP_SRC) { + oldip = iph->saddr; + newip = t->src.u3.ip; + } else { + oldip = iph->daddr; + newip = t->dst.u3.ip; + } + inet_proto_csum_replace4(check, skb, oldip, newip, 1); +} + +static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, + u8 proto, void *data, __sum16 *check, + int datalen, int oldlen) +{ + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + if (!(rt->rt_flags & RTCF_LOCAL) && + (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { + skb->ip_summed = 
CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + + skb_network_offset(skb) + + ip_hdrlen(skb); + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, proto, 0); + } else { + *check = 0; + *check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, proto, + csum_partial(data, datalen, + 0)); + if (proto == IPPROTO_UDP && !*check) + *check = CSUM_MANGLED_0; + } + } else + inet_proto_csum_replace2(check, skb, + htons(oldlen), htons(datalen), 1); +} + +static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_NAT_V4_MINIP]) { + range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); + range->flags |= NF_NAT_RANGE_MAP_IPS; + } + + if (tb[CTA_NAT_V4_MAXIP]) + range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); + else + range->max_addr.ip = range->min_addr.ip; + + return 0; +} + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { + .l3proto = NFPROTO_IPV4, + .in_range = nf_nat_ipv4_in_range, + .secure_port = nf_nat_ipv4_secure_port, + .manip_pkt = nf_nat_ipv4_manip_pkt, + .csum_update = nf_nat_ipv4_csum_update, + .csum_recalc = nf_nat_ipv4_csum_recalc, + .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, +#ifdef CONFIG_XFRM + .decode_session = nf_nat_ipv4_decode_session, +#endif +}; + +int nf_nat_icmp_reply_translation(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum) +{ + struct { + struct icmphdr icmp; + struct iphdr ip; + } *inside; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); + unsigned int hdrlen = ip_hdrlen(skb); + const struct nf_nat_l4proto *l4proto; + struct nf_conntrack_tuple target; + unsigned long statusbit; + + NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + + if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + return 0; + if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) + return 0; + + inside = (void *)skb->data + hdrlen; + if (inside->icmp.type == ICMP_REDIRECT) { + if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) + return 0; + if (ct->status & IPS_NAT_MASK) + return 0; + } + + if (manip == NF_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply direction */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + + l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); + if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), + l4proto, &ct->tuplehash[!dir].tuple, !manip)) + return 0; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + /* Reloading "inside" here since manip_pkt may reallocate */ + inside = (void *)skb->data + hdrlen; + inside->icmp.checksum = 0; + inside->icmp.checksum = + csum_fold(skb_checksum(skb, hdrlen, + skb->len - hdrlen, 0)); + } + + /* Change outer to look like the reply to an incoming packet */ + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); + if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); + +static int __init nf_nat_l3proto_ipv4_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp); + if (err < 0) + goto err1; + err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); + if (err < 0) + goto err2; + return err; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, 
&nf_nat_l4proto_icmp); +err1: + return err; +} + +static void __exit nf_nat_l3proto_ipv4_exit(void) +{ + nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); +} + +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-" __stringify(AF_INET)); + +module_init(nf_nat_l3proto_ipv4_init); +module_exit(nf_nat_l3proto_ipv4_exit); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index c273d58980a..657d2307f03 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -13,6 +13,8 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * * TODO: - NAT to a unique tuple, not to TCP source port * (needs netfilter tuple reservation) */ @@ -22,7 +24,6 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_zones.h> @@ -47,9 +48,9 @@ static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_tuple t; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; - struct nf_nat_ipv4_range range; + struct nf_nat_range range; - ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; + ct_pptp_info = nfct_help_data(master); nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; /* And here goes the grand finale of corrosion... */ @@ -89,21 +90,21 @@ static void pptp_nat_expected(struct nf_conn *ct, /* Change src to where master sends to */ range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.dst.u3; if (exp->dir == IP_CT_DIR_ORIGINAL) { range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - range.min = range.max = exp->saved_proto; + range.min_proto = range.max_proto = exp->saved_proto; } nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. 
*/ range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.src.u3; if (exp->dir == IP_CT_DIR_REPLY) { range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - range.min = range.max = exp->saved_proto; + range.min_proto = range.max_proto = exp->saved_proto; } nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } @@ -113,6 +114,7 @@ static int pptp_outbound_pkt(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq) @@ -123,7 +125,7 @@ pptp_outbound_pkt(struct sk_buff *skb, __be16 new_callid; unsigned int cid_off; - ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; + ct_pptp_info = nfct_help_data(ct); nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; new_callid = ct_pptp_info->pns_call_id; @@ -175,7 +177,7 @@ pptp_outbound_pkt(struct sk_buff *skb, ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); /* mangle packet */ - if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, cid_off + sizeof(struct pptp_pkt_hdr) + sizeof(struct PptpControlHeader), sizeof(new_callid), (char *)&new_callid, @@ -192,7 +194,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, struct nf_ct_pptp_master *ct_pptp_info; struct nf_nat_pptp *nat_pptp_info; - ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; + ct_pptp_info = nfct_help_data(ct); nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; /* save original PAC call ID in nat_info */ @@ -216,6 +218,7 @@ static int pptp_inbound_pkt(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq) { @@ -268,7 +271,7 @@ pptp_inbound_pkt(struct sk_buff *skb, pr_debug("altering peer call id from 0x%04x to 0x%04x\n", ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); - if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, pcid_off + sizeof(struct pptp_pkt_hdr) + sizeof(struct PptpControlHeader), sizeof(new_pcid), (char *)&new_pcid, diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c deleted file mode 100644 index 9993bc93e10..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_common.c +++ /dev/null @@ -1,114 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/random.h> -#include <linux/ip.h> - -#include <linux/netfilter.h> -#include <linux/export.h> -#include <net/secure_seq.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> - -bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - __be16 port; - - if (maniptype == NF_NAT_MANIP_SRC) - port = tuple->src.u.all; - else - port = tuple->dst.u.all; - - return ntohs(port) >= ntohs(min->all) && - ntohs(port) <= ntohs(max->all); -} -EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); - -void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct, - u_int16_t *rover) -{ - unsigned int range_size, min, i; - __be16 *portptr; - u_int16_t off; - - if (maniptype == NF_NAT_MANIP_SRC) - portptr = &tuple->src.u.all; - else - portptr = &tuple->dst.u.all; - - /* If no range specified... */ - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == NF_NAT_MANIP_DST) - return; - - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr) < 512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.all); - range_size = ntohs(range->max.all) - min + 1; - } - - if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) - off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip, - maniptype == NF_NAT_MANIP_SRC - ? tuple->dst.u.all - : tuple->src.u.all); - else - off = *rover; - - for (i = 0; ; ++off) { - *portptr = htons(min + off % range_size); - if (++i != range_size && nf_nat_used_tuple(tuple, ct)) - continue; - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) - *rover = off; - return; - } - return; -} -EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); - -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -int nf_nat_proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_ipv4_range *range) -{ - if (tb[CTA_PROTONAT_PORT_MIN]) { - range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); - range->max.all = range->min.tcp.port; - range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - if (tb[CTA_PROTONAT_PORT_MAX]) { - range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); - range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - return 0; -} -EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range); -#endif diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c deleted file mode 100644 index 3f67138d187..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_dccp.c +++ /dev/null @@ -1,106 +0,0 @@ -/* - * DCCP NAT protocol helper - * - * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/dccp.h> - -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t dccp_port_rover; - -static void -dccp_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &dccp_port_rover); -} - -static bool -dccp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct iphdr *iph = (const void *)(skb->data + iphdroff); - struct dccp_hdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl * 4; - __be32 oldip, newip; - __be16 *portptr, oldport, newport; - int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - - if (skb->len >= hdroff + sizeof(struct dccp_hdr)) - hdrsize = sizeof(struct dccp_hdr); - - if (!skb_make_writable(skb, hdroff + hdrsize)) - return false; - - iph = (struct iphdr *)(skb->data + iphdroff); - hdr = (struct dccp_hdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - oldip = iph->saddr; - newip = tuple->src.u3.ip; - newport = tuple->src.u.dccp.port; - portptr = &hdr->dccph_sport; - } else { - oldip = iph->daddr; - newip = tuple->dst.u3.ip; - newport = tuple->dst.u.dccp.port; - portptr = &hdr->dccph_dport; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1); - inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - 0); - return true; -} - -static const struct nf_nat_protocol nf_nat_protocol_dccp = { - .protonum = IPPROTO_DCCP, - .manip_pkt = dccp_manip_pkt, - .in_range = nf_nat_proto_in_range, - .unique_tuple = dccp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_dccp_init(void) -{ - return nf_nat_protocol_register(&nf_nat_protocol_dccp); -} - -static void __exit nf_nat_proto_dccp_fini(void) -{ - nf_nat_protocol_unregister(&nf_nat_protocol_dccp); -} - -module_init(nf_nat_proto_dccp_init); -module_exit(nf_nat_proto_dccp_fini); - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("DCCP NAT protocol helper"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 46ba0b9ab98..690d890111b 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -21,6 +21,8 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * */ #include <linux/module.h> @@ -28,8 +30,7 @@ #include <linux/ip.h> #include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> MODULE_LICENSE("GPL"); @@ -38,8 +39,9 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); /* generate unique tuple ... 
*/ static void -gre_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, +gre_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { @@ -62,8 +64,8 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, min = 1; range_size = 0xffff; } else { - min = ntohs(range->min.gre.key); - range_size = ntohs(range->max.gre.key) - min + 1; + min = ntohs(range->min_proto.gre.key); + range_size = ntohs(range->max_proto.gre.key) - min + 1; } pr_debug("min = %u, range_size = %u\n", min, range_size); @@ -80,14 +82,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, /* manipulate a GRE packet according to maniptype */ static bool -gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, +gre_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { const struct gre_hdr *greh; struct gre_hdr_pptp *pgreh; - const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); - unsigned int hdroff = iphdroff + iph->ihl * 4; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ @@ -117,24 +119,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, return true; } -static const struct nf_nat_protocol gre = { - .protonum = IPPROTO_GRE, +static const struct nf_nat_l4proto gre = { + .l4proto = IPPROTO_GRE, .manip_pkt = gre_manip_pkt, - .in_range = nf_nat_proto_in_range, + .in_range = nf_nat_l4proto_in_range, .unique_tuple = gre_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; static int __init nf_nat_proto_gre_init(void) { - return nf_nat_protocol_register(&gre); + return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); } static void __exit nf_nat_proto_gre_fini(void) { - nf_nat_protocol_unregister(&gre); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); } module_init(nf_nat_proto_gre_init); diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index b35172851ba..eb303471bcf 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -15,8 +15,7 @@ #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h> static bool icmp_in_range(const struct nf_conntrack_tuple *tuple, @@ -29,8 +28,9 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, } static void -icmp_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, +icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { @@ -38,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple, unsigned int range_size; unsigned int i; - range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; /* If no range specified... 
*/ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) range_size = 0xFFFF; for (i = 0; ; ++id) { - tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + + tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + (id % range_size)); if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) return; @@ -54,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple, static bool icmp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); struct icmphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; @@ -72,12 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb, return true; } -const struct nf_nat_protocol nf_nat_protocol_icmp = { - .protonum = IPPROTO_ICMP, +const struct nf_nat_l4proto nf_nat_l4proto_icmp = { + .l4proto = IPPROTO_ICMP, .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c deleted file mode 100644 index 3cce9b6c1c2..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/sctp.h> -#include <linux/module.h> -#include <net/sctp/checksum.h> - -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t nf_sctp_port_rover; - -static void -sctp_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &nf_sctp_port_rover); -} - -static bool -sctp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); - struct sk_buff *frag; - sctp_sctphdr_t *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be32 crc32; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - iph = (struct iphdr *)(skb->data + iphdroff); - hdr = (struct sctphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.u3.ip; - hdr->source = tuple->src.u.sctp.port; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.u3.ip; - hdr->dest = tuple->dst.u.sctp.port; - } - - crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); - skb_walk_frags(skb, frag) - crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), - crc32); - crc32 = sctp_end_cksum(crc32); - hdr->checksum = crc32; - - return true; -} - -static const struct nf_nat_protocol nf_nat_protocol_sctp = { - .protonum = IPPROTO_SCTP, - .manip_pkt = sctp_manip_pkt, - .in_range = nf_nat_proto_in_range, - .unique_tuple = sctp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_sctp_init(void) -{ - return nf_nat_protocol_register(&nf_nat_protocol_sctp); -} - -static void __exit nf_nat_proto_sctp_exit(void) -{ - nf_nat_protocol_unregister(&nf_nat_protocol_sctp); -} - -module_init(nf_nat_proto_sctp_init); -module_exit(nf_nat_proto_sctp_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SCTP NAT protocol helper"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c deleted file mode 100644 index 9fb4b4e72bb..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ /dev/null @@ -1,91 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/ip.h> -#include <linux/tcp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> - -static u_int16_t tcp_port_rover; - -static void -tcp_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover); -} - -static bool -tcp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); - struct tcphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport, oldport; - int hdrsize = 8; /* TCP connection tracking guarantees this much */ - - /* this could be a inner header returned in icmp packet; in such - cases we cannot update the checksum field since it is outside of - the 8 bytes of transport layer headers we are guaranteed */ - if (skb->len >= hdroff + sizeof(struct tcphdr)) - hdrsize = sizeof(struct tcphdr); - - if (!skb_make_writable(skb, hdroff + hdrsize)) - return false; - - iph = (struct iphdr *)(skb->data + iphdroff); - hdr = (struct tcphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.u3.ip; - newport = tuple->src.u.tcp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.u3.ip; - newport = tuple->dst.u.tcp.port; - portptr = &hdr->dest; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); - return true; -} - -const struct nf_nat_protocol nf_nat_protocol_tcp = { - .protonum = IPPROTO_TCP, - .manip_pkt = tcp_manip_pkt, - .in_range = nf_nat_proto_in_range, - .unique_tuple = tcp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c deleted file mode 100644 index 9883336e628..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ /dev/null @@ -1,82 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/export.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t udp_port_rover; - -static void -udp_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover); -} - -static bool -udp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); - struct udphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - iph = (struct iphdr *)(skb->data + iphdroff); - hdr = (struct udphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.u3.ip; - newport = tuple->src.u.udp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.u3.ip; - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } - if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - 0); - if (!hdr->check) - hdr->check = CSUM_MANGLED_0; - } - *portptr = newport; - return true; -} - -const struct nf_nat_protocol nf_nat_protocol_udp = { - .protonum = IPPROTO_UDP, - .manip_pkt = udp_manip_pkt, - .in_range = nf_nat_proto_in_range, - .unique_tuple = udp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c deleted file mode 100644 index d24d10a7beb..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_udplite.c +++ /dev/null @@ -1,98 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/module.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t udplite_port_rover; - -static void -udplite_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &udplite_port_rover); -} - -static bool -udplite_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); - struct udphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - iph = (struct iphdr *)(skb->data + iphdroff); - hdr = (struct udphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.u3.ip; - newport = tuple->src.u.udp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.u3.ip; - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } - - inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); - if (!hdr->check) - hdr->check = CSUM_MANGLED_0; - - *portptr = newport; - return true; -} - -static const struct nf_nat_protocol nf_nat_protocol_udplite = { - .protonum = IPPROTO_UDPLITE, - .manip_pkt = udplite_manip_pkt, - .in_range = nf_nat_proto_in_range, - .unique_tuple = udplite_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .nlattr_to_range = nf_nat_proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_udplite_init(void) -{ - return nf_nat_protocol_register(&nf_nat_protocol_udplite); -} - -static void __exit nf_nat_proto_udplite_fini(void) -{ - nf_nat_protocol_unregister(&nf_nat_protocol_udplite); -} - -module_init(nf_nat_proto_udplite_init); -module_exit(nf_nat_proto_udplite_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c deleted file mode 100644 index e0afe8112b1..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ /dev/null @@ -1,52 +0,0 @@ -/* The "unknown" protocol. This is what is used for protocols we - * don't understand. It's returned by ip_ct_find_proto(). - */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> - -static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type manip_type, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - return true; -} - -static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_ipv4_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - /* Sorry: we can't help you; if it's not unique, we can't frob - anything. */ - return; -} - -static bool -unknown_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - return true; -} - -const struct nf_nat_protocol nf_nat_unknown_protocol = { - .manip_pkt = unknown_manip_pkt, - .in_range = unknown_in_range, - .unique_tuple = unknown_unique_tuple, -}; diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c deleted file mode 100644 index d2a9dc314e0..00000000000 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ /dev/null @@ -1,214 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* Everything about the rules for NAT. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/slab.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/bitops.h> - -#include <linux/netfilter_ipv4/ip_tables.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> - -#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ - (1 << NF_INET_POST_ROUTING) | \ - (1 << NF_INET_LOCAL_OUT) | \ - (1 << NF_INET_LOCAL_IN)) - -static const struct xt_table nat_table = { - .name = "nat", - .valid_hooks = NAT_VALID_HOOKS, - .me = THIS_MODULE, - .af = NFPROTO_IPV4, -}; - -/* Source NAT */ -static unsigned int -ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING || - par->hooknum == NF_INET_LOCAL_IN); - - ct = nf_ct_get(skb, &ctinfo); - - /* Connection must be valid and new. */ - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - NF_CT_ASSERT(par->out != NULL); - - return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_SRC); -} - -static unsigned int -ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_OUT); - - ct = nf_ct_get(skb, &ctinfo); - - /* Connection must be valid and new. 
*/ - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - - return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_DST); -} - -static int ipt_snat_checkentry(const struct xt_tgchk_param *par) -{ - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - /* Must be a valid range */ - if (mr->rangesize != 1) { - pr_info("SNAT: multiple ranges no longer supported\n"); - return -EINVAL; - } - return 0; -} - -static int ipt_dnat_checkentry(const struct xt_tgchk_param *par) -{ - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - /* Must be a valid range */ - if (mr->rangesize != 1) { - pr_info("DNAT: multiple ranges no longer supported\n"); - return -EINVAL; - } - return 0; -} - -static unsigned int -alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_ipv4_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -int nf_nat_rule_find(struct sk_buff *skb, - unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) -{ - struct net *net = nf_ct_net(ct); - int ret; - - ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); - - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - /* NUL mapping */ - ret = alloc_null_binding(ct, hooknum); - } - return ret; -} - -static struct xt_target ipt_snat_reg __read_mostly = { - .name = "SNAT", - .target = ipt_snat_target, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), - .checkentry = ipt_snat_checkentry, - .family = AF_INET, -}; - -static struct xt_target ipt_dnat_reg __read_mostly = { - .name = "DNAT", - .target = ipt_dnat_target, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), - .checkentry = ipt_dnat_checkentry, - .family = AF_INET, -}; - -static int __net_init nf_nat_rule_net_init(struct net *net) -{ - struct ipt_replace *repl; - - repl = ipt_alloc_initial_table(&nat_table); - if (repl == NULL) - return -ENOMEM; - net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl); - kfree(repl); - if (IS_ERR(net->ipv4.nat_table)) - return PTR_ERR(net->ipv4.nat_table); - return 0; -} - -static void __net_exit nf_nat_rule_net_exit(struct net *net) -{ - ipt_unregister_table(net, net->ipv4.nat_table); -} - -static struct pernet_operations nf_nat_rule_net_ops = { - .init = nf_nat_rule_net_init, - .exit = nf_nat_rule_net_exit, -}; - -int __init nf_nat_rule_init(void) -{ - int ret; - - ret = register_pernet_subsys(&nf_nat_rule_net_ops); - if (ret != 0) - goto out; - ret = xt_register_target(&ipt_snat_reg); - if (ret != 0) - goto unregister_table; - - ret = xt_register_target(&ipt_dnat_reg); - if (ret != 0) - goto unregister_snat; - - return ret; - - unregister_snat: - xt_unregister_target(&ipt_snat_reg); - unregister_table: - unregister_pernet_subsys(&nf_nat_rule_net_ops); - out: - return ret; -} - -void nf_nat_rule_cleanup(void) -{ - xt_unregister_target(&ipt_dnat_reg); - xt_unregister_target(&ipt_snat_reg); - 
unregister_pernet_subsys(&nf_nat_rule_net_ops); -} diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c deleted file mode 100644 index ea4a23813d2..00000000000 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ /dev/null @@ -1,568 +0,0 @@ -/* SIP extension for NAT alteration. - * - * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> - * based on RR's ip_nat_ftp.c and other modules. - * (C) 2007 United Security Providers - * (C) 2007, 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/ip.h> -#include <linux/udp.h> -#include <linux/tcp.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_sip.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); -MODULE_DESCRIPTION("SIP NAT helper"); -MODULE_ALIAS("ip_nat_sip"); - - -static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - unsigned int matchoff, unsigned int matchlen, - const char *buffer, unsigned int buflen) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct tcphdr *th; - unsigned int baseoff; - - if (nf_ct_protonum(ct) == IPPROTO_TCP) { - th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); - baseoff = ip_hdrlen(skb) + th->doff * 4; - matchoff += dataoff - baseoff; - - if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, - matchoff, matchlen, - buffer, buflen, false)) - return 0; - } else { - baseoff = ip_hdrlen(skb) + sizeof(struct udphdr); - matchoff += dataoff - baseoff; - - if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, - matchoff, matchlen, - buffer, buflen)) - return 0; - } - - /* Reload data pointer and adjust datalen value */ - *dptr = skb->data + dataoff; - *datalen += buflen - matchlen; - return 1; -} - -static int map_addr(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - unsigned int matchoff, unsigned int matchlen, - union nf_inet_addr *addr, __be16 port) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - unsigned int buflen; - __be32 newaddr; - __be16 newport; - - if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip && - ct->tuplehash[dir].tuple.src.u.udp.port == port) { - newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip; - newport = ct->tuplehash[!dir].tuple.dst.u.udp.port; - } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip && - ct->tuplehash[dir].tuple.dst.u.udp.port == port) { - newaddr = ct->tuplehash[!dir].tuple.src.u3.ip; - newport = ct->tuplehash[!dir].tuple.src.u.udp.port; - } else - return 1; - - if (newaddr == addr->ip && newport == port) - return 1; - - buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); - - return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, - buffer, buflen); -} - -static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - enum sip_header_types type) -{ - enum ip_conntrack_info ctinfo; - struct 
nf_conn *ct = nf_ct_get(skb, &ctinfo); - unsigned int matchlen, matchoff; - union nf_inet_addr addr; - __be16 port; - - if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, - &matchoff, &matchlen, &addr, &port) <= 0) - return 1; - return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, - &addr, port); -} - -static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned int coff, matchoff, matchlen; - enum sip_header_types hdr; - union nf_inet_addr addr; - __be16 port; - int request, in_header; - - /* Basic rules: requests and responses. */ - if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) { - if (ct_sip_parse_request(ct, *dptr, *datalen, - &matchoff, &matchlen, - &addr, &port) > 0 && - !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, - &addr, port)) - return NF_DROP; - request = 1; - } else - request = 0; - - if (nf_ct_protonum(ct) == IPPROTO_TCP) - hdr = SIP_HDR_VIA_TCP; - else - hdr = SIP_HDR_VIA_UDP; - - /* Translate topmost Via header and parameters */ - if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, - hdr, NULL, &matchoff, &matchlen, - &addr, &port) > 0) { - unsigned int matchend, poff, plen, buflen, n; - char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - - /* We're only interested in headers related to this - * connection */ - if (request) { - if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip || - port != ct->tuplehash[dir].tuple.src.u.udp.port) - goto next; - } else { - if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip || - port != ct->tuplehash[dir].tuple.dst.u.udp.port) - goto next; - } - - if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, - &addr, port)) - return NF_DROP; - - matchend = matchoff + matchlen; - - /* The maddr= parameter (RFC 2361) specifies where to send - * the reply. */ - if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, - "maddr=", &poff, &plen, - &addr) > 0 && - addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && - addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { - buflen = sprintf(buffer, "%pI4", - &ct->tuplehash[!dir].tuple.dst.u3.ip); - if (!mangle_packet(skb, dataoff, dptr, datalen, - poff, plen, buffer, buflen)) - return NF_DROP; - } - - /* The received= parameter (RFC 2361) contains the address - * from which the server received the request. */ - if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, - "received=", &poff, &plen, - &addr) > 0 && - addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && - addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - buflen = sprintf(buffer, "%pI4", - &ct->tuplehash[!dir].tuple.src.u3.ip); - if (!mangle_packet(skb, dataoff, dptr, datalen, - poff, plen, buffer, buflen)) - return NF_DROP; - } - - /* The rport= parameter (RFC 3581) contains the port number - * from which the server received the request. 
*/ - if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, - "rport=", &poff, &plen, - &n) > 0 && - htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && - htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { - __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; - buflen = sprintf(buffer, "%u", ntohs(p)); - if (!mangle_packet(skb, dataoff, dptr, datalen, - poff, plen, buffer, buflen)) - return NF_DROP; - } - } - -next: - /* Translate Contact headers */ - coff = 0; - in_header = 0; - while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, - SIP_HDR_CONTACT, &in_header, - &matchoff, &matchlen, - &addr, &port) > 0) { - if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, - &addr, port)) - return NF_DROP; - } - - if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) || - !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO)) - return NF_DROP; - - return NF_ACCEPT; -} - -static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - const struct tcphdr *th; - - if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) - return; - - th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); -} - -/* Handles expected signalling connections and media streams */ -static void ip_nat_sip_expected(struct nf_conn *ct, - struct nf_conntrack_expect *exp) -{ - struct nf_nat_ipv4_range range; - - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); - - /* For DST manip, map port here to where it's expected. */ - range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = exp->saved_proto; - range.min_ip = range.max_ip = exp->saved_ip; - nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); - - /* Change src to where master sends to, but only if the connection - * actually came from the same source. */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == - ct->master->tuplehash[exp->dir].tuple.src.u3.ip) { - range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); - } -} - -static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - struct nf_conntrack_expect *exp, - unsigned int matchoff, - unsigned int matchlen) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - __be32 newip; - u_int16_t port; - char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - unsigned int buflen; - - /* Connection will come from reply */ - if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) - newip = exp->tuple.dst.u3.ip; - else - newip = ct->tuplehash[!dir].tuple.dst.u3.ip; - - /* If the signalling port matches the connection's source port in the - * original direction, try to use the destination port in the opposite - * direction. 
*/ - if (exp->tuple.dst.u.udp.port == - ct->tuplehash[dir].tuple.src.u.udp.port) - port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); - else - port = ntohs(exp->tuple.dst.u.udp.port); - - exp->saved_ip = exp->tuple.dst.u3.ip; - exp->tuple.dst.u3.ip = newip; - exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; - exp->dir = !dir; - exp->expectfn = ip_nat_sip_expected; - - for (; port != 0; port++) { - int ret; - - exp->tuple.dst.u.udp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - - if (port == 0) - return NF_DROP; - - if (exp->tuple.dst.u3.ip != exp->saved_ip || - exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { - buflen = sprintf(buffer, "%pI4:%u", &newip, port); - if (!mangle_packet(skb, dataoff, dptr, datalen, - matchoff, matchlen, buffer, buflen)) - goto err; - } - return NF_ACCEPT; - -err: - nf_ct_unexpect_related(exp); - return NF_DROP; -} - -static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - unsigned int matchoff, matchlen; - char buffer[sizeof("65536")]; - int buflen, c_len; - - /* Get actual SDP length */ - if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, - SDP_HDR_VERSION, SDP_HDR_UNSPEC, - &matchoff, &matchlen) <= 0) - return 0; - c_len = *datalen - matchoff + strlen("v="); - - /* Now, update SDP length */ - if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, - &matchoff, &matchlen) <= 0) - return 0; - - buflen = sprintf(buffer, "%u", c_len); - return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, - buffer, buflen); -} - -static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - unsigned int sdpoff, - enum sdp_header_types type, - enum sdp_header_types term, - char *buffer, int buflen) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - unsigned int matchlen, matchoff; - - if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, - &matchoff, &matchlen) <= 0) - return -ENOENT; - return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, - buffer, buflen) ? 
0 : -EINVAL; -} - -static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - unsigned int sdpoff, - enum sdp_header_types type, - enum sdp_header_types term, - const union nf_inet_addr *addr) -{ - char buffer[sizeof("nnn.nnn.nnn.nnn")]; - unsigned int buflen; - - buflen = sprintf(buffer, "%pI4", &addr->ip); - if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term, - buffer, buflen)) - return 0; - - return mangle_content_len(skb, dataoff, dptr, datalen); -} - -static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - unsigned int matchoff, - unsigned int matchlen, - u_int16_t port) -{ - char buffer[sizeof("nnnnn")]; - unsigned int buflen; - - buflen = sprintf(buffer, "%u", port); - if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, - buffer, buflen)) - return 0; - - return mangle_content_len(skb, dataoff, dptr, datalen); -} - -static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - unsigned int sdpoff, - const union nf_inet_addr *addr) -{ - char buffer[sizeof("nnn.nnn.nnn.nnn")]; - unsigned int buflen; - - /* Mangle session description owner and contact addresses */ - buflen = sprintf(buffer, "%pI4", &addr->ip); - if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, - SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, - buffer, buflen)) - return 0; - - switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, - SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, - buffer, buflen)) { - case 0: - /* - * RFC 2327: - * - * Session description - * - * c=* (connection information - not required if included in all media) - */ - case -ENOENT: - break; - default: - return 0; - } - - return mangle_content_len(skb, dataoff, dptr, datalen); -} - -/* So, this packet has hit the connection tracking matching code. - Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, unsigned int *datalen, - struct nf_conntrack_expect *rtp_exp, - struct nf_conntrack_expect *rtcp_exp, - unsigned int mediaoff, - unsigned int medialen, - union nf_inet_addr *rtp_addr) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - u_int16_t port; - - /* Connection will come from reply */ - if (ct->tuplehash[dir].tuple.src.u3.ip == - ct->tuplehash[!dir].tuple.dst.u3.ip) - rtp_addr->ip = rtp_exp->tuple.dst.u3.ip; - else - rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip; - - rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip; - rtp_exp->tuple.dst.u3.ip = rtp_addr->ip; - rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; - rtp_exp->dir = !dir; - rtp_exp->expectfn = ip_nat_sip_expected; - - rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip; - rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip; - rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; - rtcp_exp->dir = !dir; - rtcp_exp->expectfn = ip_nat_sip_expected; - - /* Try to get same pair of ports: if not, try to change them. 
*/ - for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); - port != 0; port += 2) { - int ret; - - rtp_exp->tuple.dst.u.udp.port = htons(port); - ret = nf_ct_expect_related(rtp_exp); - if (ret == -EBUSY) - continue; - else if (ret < 0) { - port = 0; - break; - } - rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); - ret = nf_ct_expect_related(rtcp_exp); - if (ret == 0) - break; - else if (ret != -EBUSY) { - nf_ct_unexpect_related(rtp_exp); - port = 0; - break; - } - } - - if (port == 0) - goto err1; - - /* Update media port. */ - if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && - !ip_nat_sdp_port(skb, dataoff, dptr, datalen, - mediaoff, medialen, port)) - goto err2; - - return NF_ACCEPT; - -err2: - nf_ct_unexpect_related(rtp_exp); - nf_ct_unexpect_related(rtcp_exp); -err1: - return NF_DROP; -} - -static struct nf_ct_helper_expectfn sip_nat = { - .name = "sip", - .expectfn = ip_nat_sip_expected, -}; - -static void __exit nf_nat_sip_fini(void) -{ - RCU_INIT_POINTER(nf_nat_sip_hook, NULL); - RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL); - RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL); - RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL); - RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL); - RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL); - RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL); - nf_ct_helper_expectfn_unregister(&sip_nat); - synchronize_rcu(); -} - -static int __init nf_nat_sip_init(void) -{ - BUG_ON(nf_nat_sip_hook != NULL); - BUG_ON(nf_nat_sip_seq_adjust_hook != NULL); - BUG_ON(nf_nat_sip_expect_hook != NULL); - BUG_ON(nf_nat_sdp_addr_hook != NULL); - BUG_ON(nf_nat_sdp_port_hook != NULL); - BUG_ON(nf_nat_sdp_session_hook != NULL); - BUG_ON(nf_nat_sdp_media_hook != NULL); - RCU_INIT_POINTER(nf_nat_sip_hook, ip_nat_sip); - RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); - RCU_INIT_POINTER(nf_nat_sip_expect_hook, ip_nat_sip_expect); - RCU_INIT_POINTER(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); - RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port); - RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session); - RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media); - nf_ct_helper_expectfn_register(&sip_nat); - return 0; -} - -module_init(nf_nat_sip_init); -module_exit(nf_nat_sip_fini); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 746edec8b86..7c676671329 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -34,10 +34,11 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
* * Author: James Morris <jmorris@intercode.com.au> + * + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/moduleparam.h> @@ -405,7 +406,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, ptr = *octets; while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { + if (!asn1_octet_decode(ctx, ptr++)) { kfree(*octets); *octets = NULL; return 0; @@ -460,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, } if (subid < 40) { - optr [0] = 0; - optr [1] = subid; + optr[0] = 0; + optr[1] = subid; } else if (subid < 80) { - optr [0] = 1; - optr [1] = subid - 40; + optr[0] = 1; + optr[1] = subid - 40; } else { - optr [0] = 2; - optr [1] = subid - 80; + optr[0] = 2; + optr[1] = subid - 80; } *len = 2; @@ -759,7 +760,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, } break; case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + if (!asn1_oid_decode(ctx, end, &lp, &len)) { kfree(id); return 0; } @@ -1197,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct, map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); } else { /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); + map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); + map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip); } if (map.from == map.to) diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c deleted file mode 100644 index 3828a422982..00000000000 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ /dev/null @@ -1,326 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#include <linux/types.h> -#include <linux/icmp.h> -#include <linux/gfp.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/spinlock.h> - -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_extend.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_helper.h> -#include <linux/netfilter_ipv4/ip_tables.h> - -#ifdef CONFIG_XFRM -static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) -{ - struct flowi4 *fl4 = &fl->u.ip4; - const struct nf_conn *ct; - const struct nf_conntrack_tuple *t; - enum ip_conntrack_info ctinfo; - enum ip_conntrack_dir dir; - unsigned long statusbit; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return; - dir = CTINFO2DIR(ctinfo); - t = &ct->tuplehash[dir].tuple; - - if (dir == IP_CT_DIR_ORIGINAL) - statusbit = IPS_DST_NAT; - else - statusbit = IPS_SRC_NAT; - - if (ct->status & statusbit) { - fl4->daddr = t->dst.u3.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || - t->dst.protonum == IPPROTO_SCTP) - fl4->fl4_dport = t->dst.u.tcp.port; - } - - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - fl4->saddr = t->src.u3.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || - t->dst.protonum == IPPROTO_SCTP) - fl4->fl4_sport = t->src.u.tcp.port; - } -} -#endif - -static unsigned int -nf_nat_fn(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); - - /* We never see fragments: conntrack defrags on pre-routing - and local-out, and nf_nat_out protects post-routing. */ - NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - have dropped it. Hence it's the user's responsibilty to - packet filter it out, or implement conntrack/NAT for that - protocol. 8) --RR */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nfct_nat(ct); - if (!nat) { - /* NAT module was loaded late. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(ct, ctinfo, - hooknum, skb)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - - /* Seen it before? This can happen for loopback, retrans, - or local packets.. 
*/ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - } - - return nf_nat_packet(ct, ctinfo, hooknum, skb); -} - -static unsigned int -nf_nat_in(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - unsigned int ret; - __be32 daddr = ip_hdr(skb)->daddr; - - ret = nf_nat_fn(hooknum, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) - skb_dst_drop(skb); - - return ret; -} - -static unsigned int -nf_nat_out(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_fn(hooknum, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if ((ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all) - ) - return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; - } -#endif - return ret; -} - -static unsigned int -nf_nat_local_fn(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_fn(hooknum, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (ip_xfrm_me_harder(skb)) - ret = NF_DROP; -#endif - } - return ret; -} - -/* We must be after connection tracking and before packet filtering. 
*/ - -static struct nf_hook_ops nf_nat_ops[] __read_mostly = { - /* Before packet filtering, change destination */ - { - .hook = nf_nat_in, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = nf_nat_out, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC, - }, - /* Before packet filtering, change destination */ - { - .hook = nf_nat_local_fn, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = nf_nat_fn, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SRC, - }, -}; - -static int __init nf_nat_standalone_init(void) -{ - int ret = 0; - - need_ipv4_conntrack(); - -#ifdef CONFIG_XFRM - BUG_ON(ip_nat_decode_session != NULL); - RCU_INIT_POINTER(ip_nat_decode_session, nat_decode_session); -#endif - ret = nf_nat_rule_init(); - if (ret < 0) { - pr_err("nf_nat_init: can't setup rules.\n"); - goto cleanup_decode_session; - } - ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); - if (ret < 0) { - pr_err("nf_nat_init: can't register hooks.\n"); - goto cleanup_rule_init; - } - return ret; - - cleanup_rule_init: - nf_nat_rule_cleanup(); - cleanup_decode_session: -#ifdef CONFIG_XFRM - RCU_INIT_POINTER(ip_nat_decode_session, NULL); - synchronize_net(); -#endif - return ret; -} - -static void __exit nf_nat_standalone_fini(void) -{ - nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); - nf_nat_rule_cleanup(); -#ifdef CONFIG_XFRM - RCU_INIT_POINTER(ip_nat_decode_session, NULL); - synchronize_net(); -#endif - /* Conntrack caches are unregistered in nf_conntrack_cleanup */ -} - -module_init(nf_nat_standalone_init); -module_exit(nf_nat_standalone_fini); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat"); diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c deleted file mode 100644 index a2901bf829c..00000000000 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ /dev/null @@ -1,51 +0,0 @@ -/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/module.h> -#include <linux/udp.h> - -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_tftp.h> - -MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); -MODULE_DESCRIPTION("TFTP NAT helper"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_tftp"); - -static unsigned int help(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_expect *exp) -{ - const struct nf_conn *ct = exp->master; - - exp->saved_proto.udp.port - = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; - exp->dir = IP_CT_DIR_REPLY; - exp->expectfn = nf_nat_follow_master; - if (nf_ct_expect_related(exp) != 0) - return NF_DROP; - return NF_ACCEPT; -} - -static void __exit nf_nat_tftp_fini(void) -{ - RCU_INIT_POINTER(nf_nat_tftp_hook, NULL); - synchronize_rcu(); -} - -static int __init nf_nat_tftp_init(void) -{ - BUG_ON(nf_nat_tftp_hook != NULL); - RCU_INIT_POINTER(nf_nat_tftp_hook, help); - return 0; -} - -module_init(nf_nat_tftp_init); -module_exit(nf_nat_tftp_fini); diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c new file mode 100644 index 00000000000..19412a4063f --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netfilter_arp.h> +#include <net/netfilter/nf_tables.h> + +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, ops, skb, in, out); + + return nft_do_chain(&pkt, ops); +} + +static struct nft_af_info nft_af_arp __read_mostly = { + .family = NFPROTO_ARP, + .nhooks = NF_ARP_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + [NF_ARP_FORWARD] = nft_do_chain_arp, + }, +}; + +static int nf_tables_arp_init_net(struct net *net) +{ + net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.arp== NULL) + return -ENOMEM; + + memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); + + if (nft_register_afinfo(net, net->nft.arp) < 0) + goto err; + + return 0; +err: + kfree(net->nft.arp); + return -ENOMEM; +} + +static void nf_tables_arp_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.arp); + kfree(net->nft.arp); +} + +static struct pernet_operations nf_tables_arp_net_ops = { + .init = nf_tables_arp_init_net, + .exit = nf_tables_arp_exit_net, +}; + +static const struct nf_chain_type filter_arp = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_ARP, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_ARP_IN) | + (1 << NF_ARP_OUT) | + (1 << NF_ARP_FORWARD), +}; + +static int __init nf_tables_arp_init(void) +{ + int ret; + + nft_register_chain_type(&filter_arp); + ret = register_pernet_subsys(&nf_tables_arp_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_arp); + + 
return ret; +} + +static void __exit nf_tables_arp_exit(void) +{ + unregister_pernet_subsys(&nf_tables_arp_net_ops); + nft_unregister_chain_type(&filter_arp); +} + +module_init(nf_tables_arp_init); +module_exit(nf_tables_arp_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c new file mode 100644 index 00000000000..6820c8c4084 --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/ip.h> +#include <net/netfilter/nf_tables_ipv4.h> + +static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + return nft_do_chain(&pkt, ops); +} + +static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (unlikely(skb->len < sizeof(struct iphdr) || + ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { + if (net_ratelimit()) + pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " + "packet\n"); + return NF_ACCEPT; + } + + return nft_do_chain_ipv4(ops, skb, in, out, okfn); +} + +struct nft_af_info nft_af_ipv4 __read_mostly = { + .family = NFPROTO_IPV4, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_ipv4_output, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, +}; +EXPORT_SYMBOL_GPL(nft_af_ipv4); + +static int nf_tables_ipv4_init_net(struct net *net) +{ + net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.ipv4 == NULL) + return -ENOMEM; + + memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); + + if (nft_register_afinfo(net, net->nft.ipv4) < 0) + goto err; + + return 0; +err: + kfree(net->nft.ipv4); + return -ENOMEM; +} + +static void nf_tables_ipv4_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.ipv4); + kfree(net->nft.ipv4); +} + +static struct pernet_operations nf_tables_ipv4_net_ops = { + .init = nf_tables_ipv4_init_net, + .exit = nf_tables_ipv4_exit_net, +}; + +static const struct nf_chain_type filter_ipv4 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_ipv4_init(void) +{ + int ret; + + nft_register_chain_type(&filter_ipv4); + ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); + if (ret < 0) + 
nft_unregister_chain_type(&filter_ipv4); + + return ret; +} + +static void __exit nf_tables_ipv4_exit(void) +{ + unregister_pernet_subsys(&nf_tables_ipv4_net_ops); + nft_unregister_chain_type(&filter_ipv4); +} + +module_init(nf_tables_ipv4_init); +module_exit(nf_tables_ipv4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(AF_INET); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c new file mode 100644 index 00000000000..3964157d826 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +/* + * NAT chains + */ + +static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + struct nft_pktinfo pkt; + unsigned int ret; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED + IP_CT_IS_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + ops->hooknum)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall through */ + case IP_CT_NEW: + if (nf_nat_initialized(ct, maniptype)) + break; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + ret = nft_do_chain(&pkt, ops); + if (ret != NF_ACCEPT) + return ret; + if (!nf_nat_initialized(ct, maniptype)) { + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } + default: + break; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); +} + +static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + __be32 daddr = ip_hdr(skb)->daddr; + unsigned int ret; + + ret = nf_nat_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + ip_hdr(skb)->daddr != daddr) { + skb_dst_drop(skb); + } + return ret; +} + +static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo __maybe_unused; + const 
struct nf_conn *ct __maybe_unused; + unsigned int ret; + + ret = nf_nat_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip || + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all) + return nf_xfrm_me_harder(skb, AF_INET) == 0 ? + ret : NF_DROP; + } +#endif + return ret; +} + +static unsigned int nf_nat_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int ret; + + ret = nf_nat_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + if (ip_route_me_harder(skb, RTN_UNSPEC)) + ret = NF_DROP; + } +#ifdef CONFIG_XFRM + else if (ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) + if (nf_xfrm_me_harder(skb, AF_INET)) + ret = NF_DROP; +#endif + } + return ret; +} + +static const struct nf_chain_type nft_chain_nat_ipv4 = { + .name = "nat", + .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .hooks = { + [NF_INET_PRE_ROUTING] = nf_nat_prerouting, + [NF_INET_POST_ROUTING] = nf_nat_postrouting, + [NF_INET_LOCAL_OUT] = nf_nat_output, + [NF_INET_LOCAL_IN] = nf_nat_fn, + }, +}; + +static int __init nft_chain_nat_init(void) +{ + int err; + + err = nft_register_chain_type(&nft_chain_nat_ipv4); + if (err < 0) + return err; + + return 0; +} + +static void __exit nft_chain_nat_exit(void) +{ + nft_unregister_chain_type(&nft_chain_nat_ipv4); +} + +module_init(nft_chain_nat_init); +module_exit(nft_chain_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c new file mode 100644 index 00000000000..125b66766c0 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/route.h> +#include <net/ip.h> + +static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + struct nft_pktinfo pkt; + u32 mark; + __be32 saddr, daddr; + u_int8_t tos; + const struct iphdr *iph; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + mark = skb->mark; + iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; + + ret = nft_do_chain(&pkt, ops); + if (ret != NF_DROP && ret != NF_QUEUE) { + iph = ip_hdr(skb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + skb->mark != mark || + iph->tos != tos) + if (ip_route_me_harder(skb, RTN_UNSPEC)) + ret = NF_DROP; + } + return ret; +} + +static const struct nf_chain_type nft_chain_route_ipv4 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook, + }, +}; + +static int __init nft_chain_route_init(void) +{ + return nft_register_chain_type(&nft_chain_route_ipv4); +} + +static void __exit nft_chain_route_exit(void) +{ + nft_unregister_chain_type(&nft_chain_route_ipv4); +} + +module_init(nft_chain_route_init); +module_exit(nft_chain_route_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 00000000000..e79718a382f --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> + +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); + +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { + .type = &nft_reject_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv4_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "reject", + .ops = &nft_reject_ipv4_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv4_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv4_type); +} + +static void __exit nft_reject_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv4_type); +} + +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2c00e8bf684..044a0ddf6a7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -33,7 +33,6 @@ #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> -#include <net/ipv6.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -46,8 +45,22 @@ #include <net/inet_common.h> #include <net/checksum.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/transp_v6.h> +#endif + +struct ping_table { + struct hlist_nulls_head hash[PING_HTABLE_SIZE]; + rwlock_t lock; +}; static struct ping_table ping_table; +struct pingv6_ops pingv6_ops; +EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; @@ -58,6 +71,7 @@ static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int ma pr_debug("hash(%d) = %d\n", num, res); return res; } +EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -65,7 +79,7 @@ static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } -static int ping_v4_get_port(struct sock *sk, unsigned short ident) +int ping_get_port(struct sock *sk, unsigned short ident) { struct hlist_nulls_node *node; struct hlist_nulls_head *hlist; @@ -103,6 +117,10 @@ next_port: ping_portaddr_for_each_entry(sk2, node, hlist) { isk2 = inet_sk(sk2); + /* BUG? Why is this reuse and not reuseaddr? ping.c + * doesn't turn off SO_REUSEADDR, and it doesn't expect + * that other ping processes can steal its packets. 
+ */ if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) @@ -125,17 +143,18 @@ fail: write_unlock_bh(&ping_table.lock); return 1; } +EXPORT_SYMBOL_GPL(ping_get_port); -static void ping_v4_hash(struct sock *sk) +void ping_hash(struct sock *sk) { - pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); + pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ } -static void ping_v4_unhash(struct sock *sk) +void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); - pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); + pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); if (sk_hashed(sk)) { write_lock_bh(&ping_table.lock); hlist_nulls_del(&sk->sk_nulls_node); @@ -146,31 +165,60 @@ static void ping_v4_unhash(struct sock *sk) write_unlock_bh(&ping_table.lock); } } +EXPORT_SYMBOL_GPL(ping_unhash); -static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr, - u16 ident, int dif) +static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; + int dif = skb->dev->ifindex; + + if (skb->protocol == htons(ETH_P_IP)) { + pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", + (int)ident, &ip_hdr(skb)->daddr, dif); +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", + (int)ident, &ipv6_hdr(skb)->daddr, dif); +#endif + } - pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", - (int)ident, &daddr, dif); read_lock_bh(&ping_table.lock); ping_portaddr_for_each_entry(sk, hnode, hslot) { isk = inet_sk(sk); - pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk, - (int)isk->inet_num, &isk->inet_rcv_saddr, - sk->sk_bound_dev_if); - pr_debug("iterate\n"); if (isk->inet_num != ident) continue; - if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) - continue; + + if (skb->protocol == htons(ETH_P_IP) && + sk->sk_family == AF_INET) { + pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, + (int) isk->inet_num, &isk->inet_rcv_saddr, + sk->sk_bound_dev_if); + + if (isk->inet_rcv_saddr && + isk->inet_rcv_saddr != ip_hdr(skb)->daddr) + continue; +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6) && + sk->sk_family == AF_INET6) { + + pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, + (int) isk->inet_num, + &sk->sk_v6_rcv_saddr, + sk->sk_bound_dev_if); + + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && + !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, + &ipv6_hdr(skb)->daddr)) + continue; +#endif + } + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; @@ -185,54 +233,56 @@ exit: return sk; } -static void inet_get_ping_group_range_net(struct net *net, gid_t *low, - gid_t *high) +static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, + kgid_t *high) { - gid_t *data = net->ipv4.sysctl_ping_group_range; + kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { - seq = read_seqbegin(&sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } -static int ping_init_sock(struct sock *sk) +int ping_init_sock(struct sock *sk) { 
struct net *net = sock_net(sk); - gid_t group = current_egid(); - gid_t range[2]; - struct group_info *group_info = get_current_groups(); - int i, j, count = group_info->ngroups; + kgid_t group = current_egid(); + struct group_info *group_info; + int i, j, count; kgid_t low, high; + int ret = 0; - inet_get_ping_group_range_net(net, range, range+1); - low = make_kgid(&init_user_ns, range[0]); - high = make_kgid(&init_user_ns, range[1]); - if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low)) - return -EACCES; - - if (range[0] <= group && group <= range[1]) + inet_get_ping_group_range_net(net, &low, &high); + if (gid_lte(low, group) && gid_lte(group, high)) return 0; + group_info = get_current_groups(); + count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { kgid_t gid = group_info->blocks[i][j]; if (gid_lte(low, gid) && gid_lte(gid, high)) - return 0; + goto out_release_group; } count -= cp_count; } - return -EACCES; + ret = -EACCES; + +out_release_group: + put_group_info(group_info); + return ret; } +EXPORT_SYMBOL_GPL(ping_init_sock); -static void ping_close(struct sock *sk, long timeout) +void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); @@ -240,36 +290,125 @@ static void ping_close(struct sock *sk, long timeout) sk_common_release(sk); } +EXPORT_SYMBOL_GPL(ping_close); + +/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ +static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, + struct sockaddr *uaddr, int addr_len) { + struct net *net = sock_net(sk); + if (sk->sk_family == AF_INET) { + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + int chk_addr_ret; + + if (addr_len < sizeof(*addr)) + return -EINVAL; + + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", + sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); + + chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + + if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) + chk_addr_ret = RTN_LOCAL; + + if ((sysctl_ip_nonlocal_bind == 0 && + isk->freebind == 0 && isk->transparent == 0 && + chk_addr_ret != RTN_LOCAL) || + chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST) + return -EADDRNOTAVAIL; + +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_family == AF_INET6) { + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; + int addr_type, scoped, has_addr; + struct net_device *dev = NULL; + + if (addr_len < sizeof(*addr)) + return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EINVAL; + + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", + sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); + + addr_type = ipv6_addr_type(&addr->sin6_addr); + scoped = __ipv6_addr_needs_scope_id(addr_type); + if ((addr_type != IPV6_ADDR_ANY && + !(addr_type & IPV6_ADDR_UNICAST)) || + (scoped && !addr->sin6_scope_id)) + return -EINVAL; + + rcu_read_lock(); + if (addr->sin6_scope_id) { + dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + } + has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, + scoped); + rcu_read_unlock(); + + if (!(isk->freebind || isk->transparent || has_addr || + addr_type == IPV6_ADDR_ANY)) + return -EADDRNOTAVAIL; + + if (scoped) + sk->sk_bound_dev_if = addr->sin6_scope_id; +#endif + } else { + return -EAFNOSUPPORT; + } + return 0; +} + +static void ping_set_saddr(struct 
sock *sk, struct sockaddr *saddr) +{ + if (saddr->sa_family == AF_INET) { + struct inet_sock *isk = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) saddr; + isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (saddr->sa_family == AF_INET6) { + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; + struct ipv6_pinfo *np = inet6_sk(sk); + sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr; +#endif + } +} +static void ping_clear_saddr(struct sock *sk, int dif) +{ + sk->sk_bound_dev_if = dif; + if (sk->sk_family == AF_INET) { + struct inet_sock *isk = inet_sk(sk); + isk->inet_rcv_saddr = isk->inet_saddr = 0; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); + memset(&np->saddr, 0, sizeof(np->saddr)); +#endif + } +} /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ -static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *isk = inet_sk(sk); unsigned short snum; - int chk_addr_ret; int err; + int dif = sk->sk_bound_dev_if; - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", - sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); - - chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) - chk_addr_ret = RTN_LOCAL; - - if ((sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && - chk_addr_ret != RTN_LOCAL) || - chk_addr_ret == RTN_MULTICAST || - chk_addr_ret == RTN_BROADCAST) - return -EADDRNOTAVAIL; + err = ping_check_bind_addr(sk, isk, uaddr, addr_len); + if (err) + return err; lock_sock(sk); @@ -278,42 +417,52 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto out; err = -EADDRINUSE; - isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; - snum = ntohs(addr->sin_port); - if (ping_v4_get_port(sk, snum) != 0) { - isk->inet_saddr = isk->inet_rcv_saddr = 0; + ping_set_saddr(sk, uaddr); + snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); + if (ping_get_port(sk, snum) != 0) { + ping_clear_saddr(sk, dif); goto out; } - pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n", + pr_debug("after bind(): num = %d, dif = %d\n", (int)isk->inet_num, - &isk->inet_rcv_saddr, (int)sk->sk_bound_dev_if); err = 0; - if (isk->inet_rcv_saddr) + if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +#endif + if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); +#endif + sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } +EXPORT_SYMBOL_GPL(ping_bind); /* * Is this a supported type of ICMP message? 
*/ -static inline int ping_supported(int type, int code) +static inline int ping_supported(int family, int type, int code) { - if (type == ICMP_ECHO && code == 0) - return 1; - return 0; + return (family == AF_INET && type == ICMP_ECHO && code == 0) || + (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0); } /* @@ -321,30 +470,42 @@ static inline int ping_supported(int type, int code) * sort of error condition. */ -static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); - -void ping_err(struct sk_buff *skb, u32 info) +void ping_err(struct sk_buff *skb, int offset, u32 info) { - struct iphdr *iph = (struct iphdr *)skb->data; - struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + int family; + struct icmphdr *icmph; struct inet_sock *inet_sock; - int type = icmph->type; - int code = icmph->code; + int type; + int code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; + if (skb->protocol == htons(ETH_P_IP)) { + family = AF_INET; + type = icmp_hdr(skb)->type; + code = icmp_hdr(skb)->code; + icmph = (struct icmphdr *)(skb->data + offset); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + family = AF_INET6; + type = icmp6_hdr(skb)->icmp6_type; + code = icmp6_hdr(skb)->icmp6_code; + icmph = (struct icmphdr *) (skb->data + offset); + } else { + BUG(); + } + /* We assume the packet has already been checked by icmp_unreach */ - if (!ping_supported(icmph->type, icmph->code)) + if (!ping_supported(family, icmph->type, icmph->code)) return; - pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, - code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", + skb->protocol, type, code, ntohs(icmph->un.echo.id), + ntohs(icmph->un.echo.sequence)); - sk = ping_v4_lookup(net, iph->daddr, iph->saddr, - ntohs(icmph->un.echo.id), skb->dev->ifindex); + sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk == NULL) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ @@ -355,70 +516,83 @@ void ping_err(struct sk_buff *skb, u32 info) harderr = 0; inet_sock = inet_sk(sk); - switch (type) { - default: - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - case ICMP_SOURCE_QUENCH: - /* This is not a real error but ping wants to see it. - * Report it with some fake errno. */ - err = EREMOTEIO; - break; - case ICMP_PARAMETERPROB: - err = EPROTO; - harderr = 1; - break; - case ICMP_DEST_UNREACH: - if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ - if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { - err = EMSGSIZE; - harderr = 1; - break; + if (skb->protocol == htons(ETH_P_IP)) { + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + /* This is not a real error but ping wants to see it. + * Report it with some fake errno. 
+ */ + err = EREMOTEIO; + break; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + ipv4_sk_update_pmtu(skb, sk, info); + if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; } - goto out; - } - err = EHOSTUNREACH; - if (code <= NR_ICMP_UNREACH) { - harderr = icmp_err_convert[code].fatal; - err = icmp_err_convert[code].errno; + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + case ICMP_REDIRECT: + /* See ICMP_SOURCE_QUENCH */ + ipv4_sk_redirect(skb, sk); + err = EREMOTEIO; + break; } - break; - case ICMP_REDIRECT: - /* See ICMP_SOURCE_QUENCH */ - err = EREMOTEIO; - break; +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); +#endif } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ - if (!inet_sock->recverr) { + if ((family == AF_INET && !inet_sock->recverr) || + (family == AF_INET6 && !inet6_sk(sk)->recverr)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { - ip_icmp_error(sk, skb, err, 0 /* no remote port */, - info, (u8 *)icmph); + if (family == AF_INET) { + ip_icmp_error(sk, skb, err, 0 /* no remote port */, + info, (u8 *)icmph); +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, + info, (u8 *)icmph); +#endif + } } sk->sk_err = err; sk->sk_error_report(sk); out: sock_put(sk); } +EXPORT_SYMBOL_GPL(ping_err); /* - * Copy and checksum an ICMP Echo packet from user space into a buffer. + * Copy and checksum an ICMP Echo packet from user space into a buffer + * starting from the payload. */ -struct pingfakehdr { - struct icmphdr icmph; - struct iovec *iov; - __wsum wcheck; -}; - -static int ping_getfrag(void *from, char *to, - int offset, int fraglen, int odd, struct sk_buff *skb) +int ping_getfrag(void *from, char *to, + int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = (struct pingfakehdr *)from; @@ -429,20 +603,33 @@ static int ping_getfrag(void *from, char *to, pfh->iov, 0, fraglen - sizeof(struct icmphdr), &pfh->wcheck)) return -EFAULT; + } else if (offset < sizeof(struct icmphdr)) { + BUG(); + } else { + if (csum_partial_copy_fromiovecend + (to, pfh->iov, offset - sizeof(struct icmphdr), + fraglen, &pfh->wcheck)) + return -EFAULT; + } - return 0; +#if IS_ENABLED(CONFIG_IPV6) + /* For IPv6, checksum each skb as we go along, as expected by + * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in + * wcheck, it will be finalized in ping_v4_push_pending_frames. 
+ */ + if (pfh->family == AF_INET6) { + skb->csum = pfh->wcheck; + skb->ip_summed = CHECKSUM_NONE; + pfh->wcheck = 0; } - if (offset < sizeof(struct icmphdr)) - BUG(); - if (csum_partial_copy_fromiovecend - (to, pfh->iov, offset - sizeof(struct icmphdr), - fraglen, &pfh->wcheck)) - return -EFAULT; +#endif + return 0; } +EXPORT_SYMBOL_GPL(ping_getfrag); -static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, - struct flowi4 *fl4) +static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, + struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); @@ -454,24 +641,9 @@ static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, return ip_push_pending_frames(sk, fl4); } -static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) -{ - struct net *net = sock_net(sk); - struct flowi4 fl4; - struct inet_sock *inet = inet_sk(sk); - struct ipcm_cookie ipc; - struct icmphdr user_icmph; - struct pingfakehdr pfh; - struct rtable *rt = NULL; - struct ip_options_data opt_copy; - int free = 0; - __be32 saddr, daddr, faddr; - u8 tos; - int err; - - pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); - +int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, + void *user_icmph, size_t icmph_len) { + u8 type, code; if (len > 0xFFFF) return -EMSGSIZE; @@ -486,21 +658,59 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* * Fetch the ICMP header provided by the userland. - * iovec is modified! + * iovec is modified! The ICMP header is consumed. */ - - if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, - sizeof(struct icmphdr))) + if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len)) return -EFAULT; - if (!ping_supported(user_icmph.type, user_icmph.code)) + + if (family == AF_INET) { + type = ((struct icmphdr *) user_icmph)->type; + code = ((struct icmphdr *) user_icmph)->code; +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + type = ((struct icmp6hdr *) user_icmph)->icmp6_type; + code = ((struct icmp6hdr *) user_icmph)->icmp6_code; +#endif + } else { + BUG(); + } + + if (!ping_supported(family, type, code)) return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(ping_common_sendmsg); + +static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct net *net = sock_net(sk); + struct flowi4 fl4; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct icmphdr user_icmph; + struct pingfakehdr pfh; + struct rtable *rt = NULL; + struct ip_options_data opt_copy; + int free = 0; + __be32 saddr, daddr, faddr; + u8 tos; + int err; + + pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, + sizeof(user_icmph)); + if (err) + return err; + /* * Get and verify the address. 
*/ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) @@ -518,12 +728,13 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; ipc.tx_flags = 0; - err = sock_tx_timestamp(sk, &ipc.tx_flags); - if (err) - return err; + ipc.ttl = 0; + ipc.tos = -1; + + sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) return err; if (ipc.opt) @@ -550,7 +761,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EINVAL; faddr = ipc.opt->opt.faddr; } - tos = RT_TOS(inet->tos); + tos = get_rttos(&ipc, inet); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -575,7 +786,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -600,13 +811,14 @@ back_from_confirm: pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.iov = msg->msg_iov; pfh.wcheck = 0; + pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else - err = ping_push_pending_frames(sk, &pfh, &fl4); + err = ping_v4_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: @@ -627,11 +839,11 @@ do_confirm: goto out; } -static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + int family = sk->sk_family; struct sk_buff *skb; int copied, err; @@ -641,11 +853,16 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto out; - if (addr_len) - *addr_len = sizeof(*sin); - - if (flags & MSG_ERRQUEUE) - return ip_recv_error(sk, msg, len); + if (flags & MSG_ERRQUEUE) { + if (family == AF_INET) { + return ip_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + return pingv6_ops.ipv6_recv_error(sk, msg, len, + addr_len); +#endif + } + } skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) @@ -664,15 +881,52 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_recv_timestamp(msg, sk, skb); - /* Copy the address. */ - if (sin) { - sin->sin_family = AF_INET; - sin->sin_port = 0 /* skb->h.uh->source */; - sin->sin_addr.s_addr = ip_hdr(skb)->saddr; - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + /* Copy the address and add cmsg data. 
*/ + if (family == AF_INET) { + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); + + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = 0 /* skb->h.uh->source */; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + *addr_len = sizeof(*sin); + } + + if (isk->cmsg_flags) + ip_cmsg_recv(msg, skb); + +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *ip6 = ipv6_hdr(skb); + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + + if (sin6) { + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_addr = ip6->saddr; + sin6->sin6_flowinfo = 0; + if (np->sndflow) + sin6->sin6_flowinfo = ip6_flowinfo(ip6); + sin6->sin6_scope_id = + ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); + *addr_len = sizeof(*sin6); + } + + if (inet6_sk(sk)->rxopt.all) + pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); + if (skb->protocol == htons(ETH_P_IPV6) && + inet6_sk(sk)->rxopt.all) + pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); + else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) + ip_cmsg_recv(msg, skb); +#endif + } else { + BUG(); } - if (isk->cmsg_flags) - ip_cmsg_recv(msg, skb); + err = copied; done: @@ -681,8 +935,9 @@ out: pr_debug("ping_recvmsg -> %d\n", err); return err; } +EXPORT_SYMBOL_GPL(ping_recvmsg); -static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); @@ -693,6 +948,7 @@ static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } return 0; } +EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); /* @@ -703,10 +959,7 @@ void ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); - struct iphdr *iph = ip_hdr(skb); struct icmphdr *icmph = icmp_hdr(skb); - __be32 saddr = iph->saddr; - __be32 daddr = iph->daddr; /* We assume the packet has already been checked by icmp_rcv */ @@ -716,8 +969,7 @@ void ping_rcv(struct sk_buff *skb) /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); - sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), - skb->dev->ifindex); + sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk != NULL) { pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); @@ -728,6 +980,7 @@ void ping_rcv(struct sk_buff *skb) /* We're called from icmp_rcv(). kfree_skb() is done there. */ } +EXPORT_SYMBOL_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", @@ -738,13 +991,14 @@ struct proto ping_prot = { .disconnect = udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .sendmsg = ping_sendmsg, + .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_v4_hash, - .unhash = ping_v4_unhash, - .get_port = ping_v4_get_port, + .release_cb = ip4_datagram_release_cb, + .hash = ping_hash, + .unhash = ping_unhash, + .get_port = ping_get_port, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); @@ -768,7 +1022,8 @@ static struct sock *ping_get_first(struct seq_file *seq, int start) continue; sk_nulls_for_each(sk, node, hslot) { - if (net_eq(sock_net(sk), net)) + if (net_eq(sock_net(sk), net) && + sk->sk_family == state->family) goto found; } } @@ -801,17 +1056,24 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) return pos ? 
NULL : sk; } -static void *ping_seq_start(struct seq_file *seq, loff_t *pos) +void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) { struct ping_iter_state *state = seq->private; state->bucket = 0; + state->family = family; read_lock_bh(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } +EXPORT_SYMBOL_GPL(ping_seq_start); + +static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) +{ + return ping_seq_start(seq, pos, AF_INET); +} -static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) +void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; @@ -823,14 +1085,16 @@ static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } +EXPORT_SYMBOL_GPL(ping_seq_next); -static void ping_seq_stop(struct seq_file *seq, void *v) +void ping_seq_stop(struct seq_file *seq, void *v) { read_unlock_bh(&ping_table.lock); } +EXPORT_SYMBOL_GPL(ping_seq_stop); -static void ping_format_sock(struct sock *sp, struct seq_file *f, - int bucket, int *len) +static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, + int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -839,92 +1103,107 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops), len); + atomic_read(&sp->sk_drops)); } -static int ping_seq_show(struct seq_file *seq, void *v) +static int ping_v4_seq_show(struct seq_file *seq, void *v) { + seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; - int len; - ping_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len, ""); + ping_v4_format_sock(v, seq, state->bucket); } + seq_pad(seq, '\n'); return 0; } -static const struct seq_operations ping_seq_ops = { - .show = ping_seq_show, - .start = ping_seq_start, +static const struct seq_operations ping_v4_seq_ops = { + .show = ping_v4_seq_show, + .start = ping_v4_seq_start, .next = ping_seq_next, .stop = ping_seq_stop, }; static int ping_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &ping_seq_ops, + struct ping_seq_afinfo *afinfo = PDE_DATA(inode); + return seq_open_net(inode, file, &afinfo->seq_ops, sizeof(struct ping_iter_state)); } -static const struct file_operations ping_seq_fops = { +const struct file_operations ping_seq_fops = { .open = ping_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; +EXPORT_SYMBOL_GPL(ping_seq_fops); + +static struct ping_seq_afinfo ping_v4_seq_afinfo = { + .name = "icmp", + .family = AF_INET, + .seq_fops = &ping_seq_fops, + .seq_ops = { + .start = ping_v4_seq_start, + .show = ping_v4_seq_show, + .next = ping_seq_next, + .stop = ping_seq_stop, + }, +}; -static int ping_proc_register(struct net *net) +int 
ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo) { struct proc_dir_entry *p; - int rc = 0; - - p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); + p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + afinfo->seq_fops, afinfo); if (!p) - rc = -ENOMEM; - return rc; + return -ENOMEM; + return 0; } +EXPORT_SYMBOL_GPL(ping_proc_register); -static void ping_proc_unregister(struct net *net) +void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo) { - proc_net_remove(net, "icmp"); + remove_proc_entry(afinfo->name, net->proc_net); } +EXPORT_SYMBOL_GPL(ping_proc_unregister); - -static int __net_init ping_proc_init_net(struct net *net) +static int __net_init ping_v4_proc_init_net(struct net *net) { - return ping_proc_register(net); + return ping_proc_register(net, &ping_v4_seq_afinfo); } -static void __net_exit ping_proc_exit_net(struct net *net) +static void __net_exit ping_v4_proc_exit_net(struct net *net) { - ping_proc_unregister(net); + ping_proc_unregister(net, &ping_v4_seq_afinfo); } -static struct pernet_operations ping_net_ops = { - .init = ping_proc_init_net, - .exit = ping_proc_exit_net, +static struct pernet_operations ping_v4_net_ops = { + .init = ping_v4_proc_init_net, + .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { - return register_pernet_subsys(&ping_net_ops); + return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { - unregister_pernet_subsys(&ping_net_ops); + unregister_pernet_subsys(&ping_v4_net_ops); } #endif diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8af0d44e4e2..ae0af9386f7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -/* Following RFC4293 items are displayed in /proc/net/netstat */ +/* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), @@ -125,6 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + /* Non RFC4293 fields */ + SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), + SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; @@ -162,6 +168,7 @@ static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), + SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -172,6 +179,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -224,6 +232,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), + SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), + 
SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), @@ -232,7 +242,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), - SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), @@ -258,6 +267,25 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), + SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), + SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), + SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), + SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), + SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), + SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), + SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), + SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), + SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), + SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), + SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), + SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), + SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), + SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), + SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), + SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), + SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), + SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), + SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_SENTINEL }; @@ -310,22 +338,23 @@ static void icmp_put(struct seq_file *seq) struct net *net = seq->private; atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; - seq_puts(seq, "\nIcmp: InMsgs InErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_printf(seq, " OutMsgs OutErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); - seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, "\nIcmp: %lu %lu %lu", + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void __percpu **) 
net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } @@ -350,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); @@ -366,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -381,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udp_statistics, + snmp_fold_field(net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -392,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udplite_statistics, + snmp_fold_field(net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -429,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.net_statistics, + snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -439,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); @@ -462,28 +491,29 @@ static const struct file_operations netstat_seq_fops = { static __net_init int ip_proc_init_net(struct net *net) { - if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) + if (!proc_create("sockstat", S_IRUGO, net->proc_net, + &sockstat_seq_fops)) goto out_sockstat; - if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) + if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops)) goto out_netstat; - if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) + if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops)) goto out_snmp; return 0; out_snmp: - proc_net_remove(net, "netstat"); + remove_proc_entry("netstat", net->proc_net); out_netstat: - proc_net_remove(net, "sockstat"); + remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static 
__net_exit void ip_proc_exit_net(struct net *net) { - proc_net_remove(net, "snmp"); - proc_net_remove(net, "netstat"); - proc_net_remove(net, "sockstat"); + remove_proc_entry("snmp", net->proc_net); + remove_proc_entry("netstat", net->proc_net); + remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9ae5c01cd0b..46d6a1c923a 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,29 +29,33 @@ #include <net/protocol.h> const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; - -/* - * Add a protocol handler to the hash tables - */ +const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash = protocol & (MAX_INET_PROTOS - 1); + if (!prot->netns_ok) { + pr_err("Protocol %u is not namespace aware, cannot register.\n", + protocol); + return -EINVAL; + } - return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); -/* - * Remove a protocol from the hash tables. - */ +int inet_add_offload(const struct net_offload *prot, unsigned char protocol) +{ + return !cmpxchg((const struct net_offload **)&inet_offloads[protocol], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet_add_offload); int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); + int ret; - ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], prot, NULL) == prot) ? 0 : -1; synchronize_net(); @@ -59,3 +63,16 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) return ret; } EXPORT_SYMBOL(inet_del_protocol); + +int inet_del_offload(const struct net_offload *prot, unsigned char protocol) +{ + int ret; + + ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol], + prot, NULL) == prot) ? 
0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(inet_del_offload); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4032b818f3e..2c65160565e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -111,9 +111,7 @@ EXPORT_SYMBOL_GPL(raw_unhash_sk); static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) { - struct hlist_node *node; - - sk_for_each_from(sk, node) { + sk_for_each_from(sk) { struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && @@ -131,18 +129,20 @@ found: * 0 - deliver * 1 - block */ -static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { - int type; + struct icmphdr _hdr; + const struct icmphdr *hdr; - if (!pskb_may_pull(skb, sizeof(struct icmphdr))) + hdr = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_hdr), &_hdr); + if (!hdr) return 1; - type = icmp_hdr(skb)->type; - if (type < 32) { + if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; - return ((1 << type) & data) != 0; + return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ @@ -216,6 +216,13 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) int err = 0; int harderr = 0; + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_sk_update_pmtu(skb, sk, info); + else if (type == ICMP_REDIRECT) { + ipv4_sk_redirect(skb, sk); + return; + } + /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication @@ -292,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { /* Charge it to the socket. */ - ipv4_pktinfo_prepare(skb); + ipv4_pktinfo_prepare(sk, skb); if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; @@ -382,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -486,7 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (msg->msg_namelen) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; @@ -512,10 +519,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -551,7 +560,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr = ipc.opt->opt.faddr; } } - tos = RT_CONN_FLAGS(sk); + tos = get_rtconn_flags(&ipc, sk); if (msg->msg_flags & MSG_DONTROUTE) tos |= RTO_ONLINK; @@ -566,7 +575,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk) | + (inet->hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); if (!inet->hdrincl) { @@ -680,17 +690,14 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; - if (addr_len) - *addr_len = sizeof(*sin); - if (flags & MSG_ERRQUEUE) { - err = ip_recv_error(sk, msg, len); + err = ip_recv_error(sk, msg, len, addr_len); goto out; } @@ -716,6 +723,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + *addr_len = sizeof(*sin); } if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); @@ -887,6 +895,7 @@ struct proto raw_prot = { .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, + .release_cb = ip4_datagram_release_cb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), @@ -906,9 +915,7 @@ static struct sock *raw_get_first(struct seq_file *seq) for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { - struct hlist_node *node; - - sk_for_each(sk, node, &state->h->ht[state->bucket]) + sk_for_each(sk, &state->h->ht[state->bucket]) if (sock_net(sk) == seq_file_net(seq)) goto found; } @@ -983,11 +990,13 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } @@ -1040,7 +1049,7 @@ static const struct file_operations raw_seq_fops = { static __net_init int raw_init_net(struct net *net) { - if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) + if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops)) return -ENOMEM; return 0; @@ -1048,7 +1057,7 @@ static __net_init int raw_init_net(struct net *net) static __net_exit void raw_exit_net(struct net *net) { - proc_net_remove(net, "raw"); + remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98b30d08efe..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,7 +70,6 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -80,7 +79,6 @@ #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> -#include <linux/workqueue.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> @@ -88,11 +86,10 @@ #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> -#include <linux/prefetch.h> +#include <linux/jhash.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -116,27 +113,17 @@ #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & 
(IPTOS_RT_MASK | RTO_ONLINK)) -#define IP_MAX_MTU 0xFFF0 - #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; -static int ip_rt_gc_interval __read_mostly = 60 * HZ; -static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; -static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int rt_chain_length_max __read_mostly = 20; - -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; /* * Interface to generic destination cache. @@ -145,67 +132,36 @@ static unsigned long expires_ljiffies; static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); static unsigned int ipv4_mtu(const struct dst_entry *dst); -static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(struct dst_ops *ops); - -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) -{ -} +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer; - u32 *p = NULL; - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - - peer = rt->peer; - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; - - p = peer->metrics; - if (inet_metrics_new(peer)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); - - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; - } else { - if (rt->fi) { - fib_info_put(rt->fi); - rt->fi = NULL; - } - } - } - return p; + WARN_ON(1); + return NULL; } -static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .gc = rt_garbage_collect, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, + .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, }; @@ -232,184 +188,25 @@ const __u8 ip_tos2prio[16] = { }; EXPORT_SYMBOL(ip_tos2prio); -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. 
- * 2) Only writers remove entries, and they hold the lock - * as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - * they do so with atomic increments and with the - * lock held. - */ - -struct rt_hash_bucket { - struct rtable __rcu *chain; -}; - -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. - * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define RT_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define RT_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define RT_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define RT_HASH_LOCK_SZ 512 -# else -# define RT_HASH_LOCK_SZ 256 -# endif -#endif - -static spinlock_t *rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ - int i; - - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, - GFP_KERNEL); - if (!rt_hash_locks) - panic("IP: failed to allocate rt_hash_locks\n"); - - for (i = 0; i < RT_HASH_LOCK_SZ; i++) - spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket *rt_hash_table __read_mostly; -static unsigned int rt_hash_mask __read_mostly; -static unsigned int rt_hash_log __read_mostly; - static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) - -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) -{ - return jhash_3words((__force u32)daddr, (__force u32)saddr, - idx, genid) - & rt_hash_mask; -} - -static inline int rt_genid(struct net *net) -{ - return atomic_read(&net->ipv4.rt_genid); -} +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { - struct seq_net_private p; - int bucket; - int genid; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ - struct rt_cache_iter_state *st = seq->private; - struct rtable *r = NULL; - - for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) - continue; - rcu_read_lock_bh(); - r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); - while (r) { - if (dev_net(r->dst.dev) == seq_file_net(seq) && - r->rt_genid == st->genid) - return r; - r = rcu_dereference_bh(r->dst.rt_next); - } - rcu_read_unlock_bh(); - } - return r; -} - -static struct rtable *__rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - - r = rcu_dereference_bh(r->dst.rt_next); - while (!r) { - rcu_read_unlock_bh(); - do { - if (--st->bucket < 0) - return NULL; - } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); - rcu_read_lock_bh(); - r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); - } - return r; -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->dst.dev) != seq_file_net(seq)) - continue; - if (r->rt_genid == st->genid) - break; - } - return r; -} - -static 
struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ - struct rtable *r = rt_cache_get_first(seq); - - if (r) - while (pos && (r = rt_cache_get_next(seq, r))) - --pos; - return pos ? NULL : r; -} - static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { - struct rt_cache_iter_state *st = seq->private; if (*pos) - return rt_cache_get_idx(seq, *pos - 1); - st->genid = rt_genid(seq_file_net(seq)); + return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct rtable *r; - - if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(seq); - else - r = rt_cache_get_next(seq, v); ++*pos; - return r; + return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -419,34 +216,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); - else { - struct rtable *r = v; - struct neighbour *n; - int len, HHUptod; - - rcu_read_lock(); - n = dst_get_neighbour_noref(&r->dst); - HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; - rcu_read_unlock(); - - seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" - "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->dst.dev ? r->dst.dev->name : "*", - (__force u32)r->rt_dst, - (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->dst.__refcnt), - r->dst.__use, 0, (__force u32)r->rt_src, - dst_metric_advmss(&r->dst) + 40, - dst_metric(&r->dst, RTAX_WINDOW), - (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + - dst_metric(&r->dst, RTAX_RTTVAR)), - r->rt_key_tos, - -1, - HHUptod, - r->rt_spec_dst, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); - } return 0; } @@ -459,8 +228,7 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &rt_cache_seq_ops, - sizeof(struct rt_cache_iter_state)); + return seq_open(file, &rt_cache_seq_ops); } static const struct file_operations rt_cache_seq_fops = { @@ -468,7 +236,7 @@ static const struct file_operations rt_cache_seq_fops = { .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_net, + .release = seq_release, }; @@ -519,7 +287,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", dst_entries_get_slow(&ipv4_dst_ops), - st->in_hit, + 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, @@ -527,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) st->in_martian_dst, st->in_martian_src, - st->out_hit, + 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, - st->gc_total, - st->gc_ignored, - st->gc_goal_miss, - st->gc_dst_overflow, - st->in_hlist_search, - st->out_hlist_search + 0, /* st->gc_total */ + 0, /* st->gc_ignored */ + 0, /* st->gc_goal_miss */ + 0, /* st->gc_dst_overflow */ + 0, /* st->in_hlist_search */ + 0 /* st->out_hlist_search */ ); return 0; } @@ -605,8 +373,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, - &rt_cache_seq_fops); + pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + &rt_cache_seq_fops); if (!pde) goto err1; @@ -658,785 +426,306 @@ static inline 
int ip_rt_proc_init(void) } #endif /* CONFIG_PROC_FS */ -static inline void rt_free(struct rtable *rt) +static inline bool rt_is_expired(const struct rtable *rth) { - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); + return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); } -static inline void rt_drop(struct rtable *rt) +void rt_cache_flush(struct net *net) { - ip_rt_put(rt); - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); + rt_genid_bump_ipv4(net); } -static inline int rt_fast_clean(struct rtable *rth) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) { - /* Kill broadcast/multicast entries very aggresively, if they - collide in hash table with more useful entries */ - return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rt_is_input_route(rth) && rth->dst.rt_next; -} + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + const struct rtable *rt; + struct neighbour *n; -static inline int rt_valuable(struct rtable *rth) -{ - return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - (rth->peer && rth->peer->pmtu_expires); -} + rt = (const struct rtable *) dst; + if (rt->rt_gateway) + pkey = (const __be32 *) &rt->rt_gateway; + else if (skb) + pkey = &ip_hdr(skb)->daddr; -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) -{ - unsigned long age; - int ret = 0; + n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); + if (n) + return n; + return neigh_create(&arp_tbl, pkey, dev); +} - if (atomic_read(&rth->dst.__refcnt)) - goto out; +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { + atomic_t id; + u32 stamp32; +}; - age = jiffies - rth->dst.lastuse; - if ((age <= tmo1 && !rt_fast_clean(rth)) || - (age <= tmo2 && rt_valuable(rth))) - goto out; - ret = 1; -out: return ret; -} +static struct ip_ident_bucket *ip_idents __read_mostly; -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. 
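A minimal sketch of the generation-counter invalidation used earlier in this hunk, where rt_cache_flush() only bumps a per-namespace counter and rt_is_expired() lazily drops any cached route whose stamped generation no longer matches; the types and counter below are stand-ins, not the kernel structures.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-namespace rt_genid counter. */
static atomic_int cache_genid;

/* Stand-in for a cached route, stamped with the generation at creation. */
struct cached_route {
	int genid;
	/* ... route data ... */
};

static void route_init(struct cached_route *rt)
{
	rt->genid = atomic_load(&cache_genid);
}

/* A "flush" is just a counter bump; stale entries are detected and
 * discarded lazily the next time they are looked up. */
static void cache_flush(void)
{
	atomic_fetch_add(&cache_genid, 1);
}

static bool route_is_expired(const struct cached_route *rt)
{
	return rt->genid != atomic_load(&cache_genid);
}

int main(void)
{
	struct cached_route rt;

	route_init(&rt);
	printf("expired before flush: %d\n", route_is_expired(&rt)); /* 0 */
	cache_flush();
	printf("expired after flush:  %d\n", route_is_expired(&rt)); /* 1 */
	return 0;
}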
*/ -static inline u32 rt_score(struct rtable *rt) +u32 ip_idents_reserve(u32 hash, int segs) { - u32 score = jiffies - rt->dst.lastuse; - - score = ~score & ~(3<<30); - - if (rt_valuable(rt)) - score |= (1<<31); - - if (rt_is_output_route(rt) || - !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) - score |= (1<<30); - - return score; -} + struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(bucket->stamp32); + u32 now = (u32)jiffies; + u32 delta = 0; -static inline bool rt_caching(const struct net *net) -{ - return net->ipv4.current_rt_cache_rebuild_count <= - net->ipv4.sysctl_rt_cache_rebuild_count; -} + if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + delta = prandom_u32_max(now - old); -static inline bool compare_hash_inputs(const struct rtable *rt1, - const struct rtable *rt2) -{ - return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | - ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); + return atomic_add_return(segs + delta, &bucket->id) - segs; } +EXPORT_SYMBOL(ip_idents_reserve); -static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) +void __ip_select_ident(struct iphdr *iph, int segs) { - return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | - ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_mark ^ rt2->rt_mark) | - (rt1->rt_key_tos ^ rt2->rt_key_tos) | - (rt1->rt_route_iif ^ rt2->rt_route_iif) | - (rt1->rt_oif ^ rt2->rt_oif)) == 0; -} + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ - return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); -} + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); -static inline int rt_is_expired(struct rtable *rth) -{ - return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); + hash = jhash_3words((__force u32)iph->daddr, + (__force u32)iph->saddr, + iph->protocol, + ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } +EXPORT_SYMBOL(__ip_select_ident); -/* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. 
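A rough user-space model of the ip_idents_reserve()/__ip_select_ident() scheme above: IP IDs come from hashed buckets of rolling atomic counters, and a bucket that has been idle is first advanced by a random amount so an observer cannot count the packets sent while it was quiet. The bucket count, timestamp granularity and random helper here are stand-ins, not the kernel's exact choices.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ID_BUCKETS 2048u

struct ident_bucket {
	atomic_uint id;    /* rolling identifier counter */
	uint32_t stamp;    /* coarse timestamp of last use */
};

static struct ident_bucket buckets[ID_BUCKETS];

/* Reserve 'segs' consecutive IDs from the bucket picked by 'hash' and
 * return the first of them; an idle bucket is skipped forward first. */
static uint32_t ident_reserve(uint32_t hash, unsigned int segs)
{
	struct ident_bucket *b = &buckets[hash % ID_BUCKETS];
	uint32_t now = (uint32_t)time(NULL);
	uint32_t delta = 0;

	if (b->stamp != now) {
		uint32_t idle = now - b->stamp;

		delta = idle ? (uint32_t)rand() % idle : 0;
		b->stamp = now;
	}
	/* Skip 'delta' IDs, then hand out 'segs' of them. */
	return atomic_fetch_add(&b->id, segs + delta) + delta;
}

int main(void)
{
	srand((unsigned int)time(NULL));
	/* Two datagrams of 1 and 3 segments toward the same destination hash. */
	printf("first id: %u\n", ident_reserve(0x1234u, 1));
	printf("next  id: %u\n", ident_reserve(0x1234u, 3));
	return 0;
}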
- * In the later case, we want to be reschedule if necessary - */ -static void rt_do_flush(struct net *net, int process_context) +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) { - unsigned int i; - struct rtable *rth, *next; - - for (i = 0; i <= rt_hash_mask; i++) { - struct rtable __rcu **pprev; - struct rtable *list; - - if (process_context && need_resched()) - cond_resched(); - rth = rcu_access_pointer(rt_hash_table[i].chain); - if (!rth) - continue; - - spin_lock_bh(rt_hash_lock_addr(i)); - - list = NULL; - pprev = &rt_hash_table[i].chain; - rth = rcu_dereference_protected(*pprev, - lockdep_is_held(rt_hash_lock_addr(i))); - - while (rth) { - next = rcu_dereference_protected(rth->dst.rt_next, - lockdep_is_held(rt_hash_lock_addr(i))); - - if (!net || - net_eq(dev_net(rth->dst.dev), net)) { - rcu_assign_pointer(*pprev, next); - rcu_assign_pointer(rth->dst.rt_next, list); - list = rth; - } else { - pprev = &rth->dst.rt_next; - } - rth = next; - } + if (sk) { + const struct inet_sock *inet = inet_sk(sk); - spin_unlock_bh(rt_hash_lock_addr(i)); - - for (; list; list = next) { - next = rcu_dereference_protected(list->dst.rt_next, 1); - rt_free(list); - } + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; } + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); } -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - * rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - -/* - * Given a hash chain and an item in this hash chain, - * find if a previous entry has the same hash_inputs - * (but differs on tos, mark or oif) - * Returns 0 if an alias is found. - * Returns ONE if rth has no alias before itself. 
- */ -static int has_noalias(const struct rtable *head, const struct rtable *rth) +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, + const struct sock *sk) { - const struct rtable *aux = head; + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; - while (aux != rth) { - if (compare_hash_inputs(aux, rth)) - return 0; - aux = rcu_dereference_protected(aux->dst.rt_next, 1); - } - return ONE; + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -static void rt_check_expire(void) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - - /* Cleanup aged off entries. */ - *rthp = rth->dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); } -/* - * Perturbation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(struct net *net) +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct sk_buff *skb) { - unsigned char shuffle; - - get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &net->ipv4.rt_genid); - inetpeer_invalidate_tree(AF_INET); + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); } -/* - * delay < 0 : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(struct net *net, int delay) +static inline void rt_free(struct rtable *rt) { - rt_cache_invalidate(net); - if (delay >= 0) - rt_do_flush(net, !in_softirq()); + call_rcu(&rt->dst.rcu_head, dst_rcu_free); } -/* Flush previous cache invalidated entries from the cache */ -void rt_cache_flush_batch(struct net *net) -{ - rt_do_flush(net, !in_softirq()); -} +static DEFINE_SPINLOCK(fnhe_lock); -static void rt_emergency_hash_rebuild(struct net *net) +static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { - net_warn_ratelimited("Route hash chain too long!\n"); - rt_cache_invalidate(net); -} - -/* - Short description of GC goals. - - We want to build algorithm, which will keep routing cache - at some equilibrium point, when number of aged off entries - is kept approximately equal to newly generated ones. - - Current expiration strength is variable "expire". - We try to adjust it dynamically, so that if networking - is idle expires is large enough to keep enough of warm entries, - and when load increases it reduces to limit cache size. - */ - -static int rt_garbage_collect(struct dst_ops *ops) -{ - static unsigned long expire = RT_GC_TIMEOUT; - static unsigned long last_gc; - static int rover; - static int equilibrium; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long now = jiffies; - int goal; - int entries = dst_entries_get_fast(&ipv4_dst_ops); - - /* - * Garbage collection is pretty expensive, - * do not make it too frequently. - */ - - RT_CACHE_STAT_INC(gc_total); - - if (now - last_gc < ip_rt_gc_min_interval && - entries < ip_rt_max_size) { - RT_CACHE_STAT_INC(gc_ignored); - goto out; - } + struct rtable *rt; - entries = dst_entries_get_slow(&ipv4_dst_ops); - /* Calculate number of entries, which we want to expire now. */ - goal = entries - (ip_rt_gc_elasticity << rt_hash_log); - if (goal <= 0) { - if (equilibrium < ipv4_dst_ops.gc_thresh) - equilibrium = ipv4_dst_ops.gc_thresh; - goal = entries - equilibrium; - if (goal > 0) { - equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = entries - equilibrium; - } - } else { - /* We are in dangerous area. Try to reduce cache really - * aggressively. 
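__build_flow_key() and the helpers above derive one routing lookup key either from a received packet's IP header or from a connected socket's state. A reduced stand-alone illustration of that split, using a hypothetical flow_key structure in place of struct flowi4:

#include <stdint.h>
#include <string.h>

/* Hypothetical reduced flow key; struct flowi4 carries more fields. */
struct flow_key {
	uint32_t daddr, saddr;
	uint32_t mark;
	int oif;
	uint8_t tos, proto;
};

/* What a received packet (skb plus IP header) would provide. */
struct pkt_info {
	uint32_t daddr, saddr, mark;
	int ifindex;
	uint8_t tos, proto;
};

/* What a connected socket would provide when no packet is at hand. */
struct sock_info {
	uint32_t daddr, saddr, mark;
	int bound_dev;
	uint8_t tos, proto;
};

static void flow_key_from_pkt(struct flow_key *fl, const struct pkt_info *p)
{
	memset(fl, 0, sizeof(*fl));
	fl->daddr = p->daddr;
	fl->saddr = p->saddr;
	fl->mark = p->mark;
	fl->oif = p->ifindex;
	fl->tos = p->tos;
	fl->proto = p->proto;
}

static void flow_key_from_sock(struct flow_key *fl, const struct sock_info *s)
{
	memset(fl, 0, sizeof(*fl));
	fl->daddr = s->daddr;
	fl->saddr = s->saddr;
	fl->mark = s->mark;
	fl->oif = s->bound_dev;
	fl->tos = s->tos;
	fl->proto = s->proto;
}

int main(void)
{
	struct pkt_info p = { .daddr = 0x0a000001, .saddr = 0x0a000002,
			      .ifindex = 2, .proto = 17 };
	struct sock_info s = { .daddr = 0x0a000001, .saddr = 0x0a000002,
			       .bound_dev = 0, .proto = 17 };
	struct flow_key a, b;

	flow_key_from_pkt(&a, &p);
	flow_key_from_sock(&b, &s);
	return a.daddr == b.daddr ? 0 : 1;
}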
- */ - goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = entries - goal; + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); + rt_free(rt); } - - if (now - last_gc >= ip_rt_gc_min_interval) - last_gc = now; - - if (goal <= 0) { - equilibrium += goal; - goto work_done; + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); + rt_free(rt); } - - do { - int i, k; - - for (i = rt_hash_mask, k = rover; i >= 0; i--) { - unsigned long tmo = expire; - - k = (k + 1) & rt_hash_mask; - rthp = &rt_hash_table[k].chain; - spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { - if (!rt_is_expired(rth) && - !rt_may_expire(rth, tmo, expire)) { - tmo >>= 1; - rthp = &rth->dst.rt_next; - continue; - } - *rthp = rth->dst.rt_next; - rt_free(rth); - goal--; - } - spin_unlock_bh(rt_hash_lock_addr(k)); - if (goal <= 0) - break; - } - rover = k; - - if (goal <= 0) - goto work_done; - - /* Goal is not achieved. We stop process if: - - - if expire reduced to zero. Otherwise, expire is halfed. - - if table is not full. - - if we are called from interrupt. - - jiffies check is just fallback/debug loop breaker. - We will not spin here for long time in any case. - */ - - RT_CACHE_STAT_INC(gc_goal_miss); - - if (expire == 0) - break; - - expire >>= 1; - - if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - } while (!in_softirq() && time_before_eq(jiffies, now)); - - if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - net_warn_ratelimited("dst cache overflow\n"); - RT_CACHE_STAT_INC(gc_dst_overflow); - return 1; - -work_done: - expire += ip_rt_gc_min_interval; - if (expire > ip_rt_gc_timeout || - dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || - dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; -out: return 0; } -/* - * Returns number of entries in a hash chain that have different hash_inputs - */ -static int slow_chain_length(const struct rtable *head) +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) { - int length = 0; - const struct rtable *rth = head; + struct fib_nh_exception *fnhe, *oldest; - while (rth) { - length += has_noalias(head, rth); - rth = rcu_dereference_protected(rth->dst.rt_next, 1); + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; } - return length >> FRACT_BITS; + fnhe_flush_routes(oldest); + return oldest; } -static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) +static inline u32 fnhe_hashfun(__be32 daddr) { - static const __be32 inaddr_any = 0; - struct net_device *dev = dst->dev; - const __be32 *pkey = daddr; - const struct rtable *rt; - struct neighbour *n; + u32 hval; - rt = (const struct rtable *) dst; + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); - if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) - pkey = &inaddr_any; - else if (rt->rt_gateway) - pkey = (const __be32 *) &rt->rt_gateway; - - n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); - if (n) - return n; - return neigh_create(&arp_tbl, pkey, dev); + return hval & (FNHE_HASH_SIZE - 1); } -static int 
rt_bind_neighbour(struct rtable *rt) +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { - struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); - if (IS_ERR(n)) - return PTR_ERR(n); - dst_set_neighbour(&rt->dst, n); + rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->dst.expires = fnhe->fnhe_expires; - return 0; + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; + rt->rt_uses_gateway = 1; + } } -static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, - struct sk_buff *skb, int ifindex) +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + u32 pmtu, unsigned long expires) { - struct rtable *rth, *cand; - struct rtable __rcu **rthp, **candp; - unsigned long now; - u32 min_score; - int chain_length; - int attempts = !in_softirq(); - -restart: - chain_length = 0; - min_score = ~(u32)0; - cand = NULL; - candp = NULL; - now = jiffies; - - if (!rt_caching(dev_net(rt->dst.dev))) { - /* - * If we're not caching, just tell the caller we - * were successful and don't touch the route. The - * caller hold the sole reference to the cache entry, and - * it will be released when the caller is done with it. - * If we drop it here, the callers have no way to resolve routes - * when we're not caching. Instead, just point *rp at rt, so - * the caller gets a single use out of the route - * Note that we do rt_free on this new route entry, so that - * once its refcount hits zero, we are still able to reap it - * (Thanks Alexey) - * Note: To avoid expensive rcu stuff for this uncached dst, - * we set DST_NOCACHE so that dst_release() can free dst without - * waiting a grace period. - */ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe; + struct rtable *rt; + unsigned int i; + int depth; + u32 hval = fnhe_hashfun(daddr); - rt->dst.flags |= DST_NOCACHE; - if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { - int err = rt_bind_neighbour(rt); - if (err) { - net_warn_ratelimited("Neighbour table failure & not caching routes\n"); - ip_rt_put(rt); - return ERR_PTR(err); - } - } + spin_lock_bh(&fnhe_lock); - goto skip_hashing; + hash = nh->nh_exceptions; + if (!hash) { + hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); + if (!hash) + goto out_unlock; + nh->nh_exceptions = hash; } - rthp = &rt_hash_table[hash].chain; - - spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (compare_keys(rth, rt) && compare_netns(rth, rt)) { - /* Put it first */ - *rthp = rth->dst.rt_next; - /* - * Since lookup is lockfree, the deletion - * must be visible to another weakly ordered CPU before - * the insertion at the start of the hash chain. - */ - rcu_assign_pointer(rth->dst.rt_next, - rt_hash_table[hash].chain); - /* - * Since lookup is lockfree, the update writes - * must be ordered for consistency on SMP. 
- */ - rcu_assign_pointer(rt_hash_table[hash].chain, rth); - - dst_use(&rth->dst, now); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - rt_drop(rt); - if (skb) - skb_dst_set(skb, &rth->dst); - return rth; - } - - if (!atomic_read(&rth->dst.__refcnt)) { - u32 score = rt_score(rth); + hash += hval; - if (score <= min_score) { - cand = rth; - candp = rthp; - min_score = score; - } - } - - chain_length++; - - rthp = &rth->dst.rt_next; + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + break; + depth++; } - if (cand) { - /* ip_rt_gc_elasticity used to be average length of chain - * length, when exceeded gc becomes really aggressive. - * - * The second limit is less certain. At the moment it allows - * only 2 entries per bucket. We will see. - */ - if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->dst.rt_next; - rt_free(cand); + if (fnhe) { + if (gw) + fnhe->fnhe_gw = gw; + if (pmtu) { + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = max(1UL, expires); } + /* Update all cached dsts too */ + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) + fill_route_from_fnhe(rt, fnhe); + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) + fill_route_from_fnhe(rt, fnhe); } else { - if (chain_length > rt_chain_length_max && - slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { - struct net *net = dev_net(rt->dst.dev); - int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(net)) { - pr_warn("%s: %d rebuilds is over limit, route caching disabled\n", - rt->dst.dev->name, num); - } - rt_emergency_hash_rebuild(net); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, - ifindex, rt_genid(net)); - goto restart; + if (depth > FNHE_RECLAIM_DEPTH) + fnhe = fnhe_oldest(hash); + else { + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); } - } - - /* Try to bind route to arp only if it is output - route or unicast forwarding path. - */ - if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { - int err = rt_bind_neighbour(rt); - if (err) { - spin_unlock_bh(rt_hash_lock_addr(hash)); - - if (err != -ENOBUFS) { - rt_drop(rt); - return ERR_PTR(err); - } - - /* Neighbour tables are full and nothing - can be released. Try to shrink route cache, - it is most likely it holds some neighbour records. - */ - if (attempts-- > 0) { - int saved_elasticity = ip_rt_gc_elasticity; - int saved_int = ip_rt_gc_min_interval; - ip_rt_gc_elasticity = 1; - ip_rt_gc_min_interval = 0; - rt_garbage_collect(&ipv4_dst_ops); - ip_rt_gc_min_interval = saved_int; - ip_rt_gc_elasticity = saved_elasticity; - goto restart; - } - - net_warn_ratelimited("Neighbour table overflow\n"); - rt_drop(rt); - return ERR_PTR(-ENOBUFS); - } - } - - rt->dst.rt_next = rt_hash_table[hash].chain; - - /* - * Since lookup is lockfree, we must make sure - * previous writes to rt are committed to memory - * before making rt visible to other CPUS. 
- */ - rcu_assign_pointer(rt_hash_table[hash].chain, rt); - - spin_unlock_bh(rt_hash_lock_addr(hash)); - -skip_hashing: - if (skb) - skb_dst_set(skb, &rt->dst); - return rt; -} - -static atomic_t __rt_peer_genid = ATOMIC_INIT(0); - -static u32 rt_peer_genid(void) -{ - return atomic_read(&__rt_peer_genid); -} - -void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) -{ - struct inet_peer *peer; - - peer = inet_getpeer_v4(daddr, create); - - if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) - inet_putpeer(peer); - else - rt->rt_peer_genid = rt_peer_genid(); -} - -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; - - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) -{ - struct rtable *rt = (struct rtable *) dst; - - if (rt && !(rt->dst.flags & DST_NOPEER)) { - if (rt->peer == NULL) - rt_bind_peer(rt, rt->rt_dst, 1); - - /* If peer is attached to destination, it is never detached, - so that we need not to grab a lock to dereference it. + fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_daddr = daddr; + fnhe->fnhe_gw = gw; + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = expires; + + /* Exception created; mark the cached routes for the nexthop + * stale, so anyone caching it rechecks if this exception + * applies to them. 
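The fnhe hashing and update_or_create_fnhe() logic in this hunk cache per-destination exceptions (a learned redirect gateway or PMTU) under each next hop. Below is a stripped-down, single-threaded sketch of the same update-or-create pattern; the field names and table size are illustrative, and the RCU protection, locking, depth limit and expiry of the real code are omitted.

#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define EXC_HASH_SIZE 2048u

/* Hypothetical per-destination exception entry. */
struct nh_exception {
	struct nh_exception *next;
	uint32_t daddr;      /* destination this exception applies to */
	uint32_t gw;         /* gateway learned from an ICMP redirect */
	uint32_t pmtu;       /* path MTU learned from frag-needed */
	time_t stamp;
};

static struct nh_exception *exc_hash[EXC_HASH_SIZE];

static unsigned int exc_hashfun(uint32_t daddr)
{
	uint32_t hval = daddr;

	hval ^= (hval >> 11) ^ (hval >> 22);
	return hval & (EXC_HASH_SIZE - 1);
}

/* Record a learned gateway or PMTU for daddr, reusing an existing
 * entry when one is already cached for that destination. */
static void exc_update_or_create(uint32_t daddr, uint32_t gw, uint32_t pmtu)
{
	struct nh_exception **head = &exc_hash[exc_hashfun(daddr)];
	struct nh_exception *e;

	for (e = *head; e; e = e->next)
		if (e->daddr == daddr)
			break;

	if (!e) {
		e = calloc(1, sizeof(*e));
		if (!e)
			return;
		e->daddr = daddr;
		e->next = *head;
		*head = e;
	}
	if (gw)
		e->gw = gw;
	if (pmtu)
		e->pmtu = pmtu;
	e->stamp = time(NULL);
}

int main(void)
{
	uint32_t daddr = 0xc6336407;	/* 198.51.100.7, host byte order */

	exc_update_or_create(daddr, 0, 1400);		/* learned PMTU */
	exc_update_or_create(daddr, 0x0a000001, 0);	/* learned gateway */
	return 0;
}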
*/ - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); - return; + rt = rcu_dereference(nh->nh_rth_input); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; + + for_each_possible_cpu(i) { + struct rtable __rcu **prt; + prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + rt = rcu_dereference(*prt); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; } - } else if (!rt) - pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0)); - - ip_select_fb_ident(iph); -} -EXPORT_SYMBOL(__ip_select_ident); + } -static void rt_del(unsigned int hash, struct rtable *rt) -{ - struct rtable __rcu **rthp; - struct rtable *aux; + fnhe->fnhe_stamp = jiffies; - rthp = &rt_hash_table[hash].chain; - spin_lock_bh(rt_hash_lock_addr(hash)); - ip_rt_put(rt); - while ((aux = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { - if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->dst.rt_next; - rt_free(aux); - continue; - } - rthp = &aux->dst.rt_next; - } - spin_unlock_bh(rt_hash_lock_addr(hash)); +out_unlock: + spin_unlock_bh(&fnhe_lock); } -static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, + bool kill_route) { - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - struct neighbour *n, *old_n; - - dst_confirm(&rt->dst); + __be32 new_gw = icmp_hdr(skb)->un.gateway; + __be32 old_gw = ip_hdr(skb)->saddr; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct neighbour *n; + struct net *net; - rt->rt_gateway = peer->redirect_learned.a4; + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + break; - n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); - if (IS_ERR(n)) { - rt->rt_gateway = orig_gw; + default: return; } - old_n = xchg(&rt->dst._neighbour, n); - if (old_n) - neigh_release(old_n); - if (!(n->nud_state & NUD_VALID)) { - neigh_event_send(n, NULL); - } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); - } -} -/* called in rcu_read_lock() section */ -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, - __be32 saddr, struct net_device *dev) -{ - int s, i; - struct in_device *in_dev = __in_dev_get_rcu(dev); - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct inet_peer *peer; - struct net *net; + if (rt->rt_gateway != old_gw) + return; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; @@ -1456,72 +745,55 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (s = 0; s < 2; s++) { - for (i = 0; i < 2; i++) { - unsigned int hash; - struct rtable __rcu **rthp; - struct rtable *rt; - - hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rt = rcu_dereference(*rthp)) != NULL) { - rthp = &rt->dst.rt_next; - - if (rt->rt_key_dst != daddr || - rt->rt_key_src != skeys[s] || - rt->rt_oif != ikeys[i] || - rt_is_input_route(rt) || - rt_is_expired(rt) || - !net_eq(dev_net(rt->dst.dev), net) || - rt->dst.error || - rt->dst.dev != dev || - rt->rt_gateway != old_gw) - continue; - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - - peer = rt->peer; - if (peer) { - if (peer->redirect_learned.a4 != new_gw) { - peer->redirect_learned.a4 = new_gw; - atomic_inc(&__rt_peer_genid); - } - check_peer_redir(&rt->dst, peer); - } + n = 
ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + + update_or_create_fnhe(nh, fl4->daddr, new_gw, + 0, 0); } + if (kill_route) + rt->dst.obsolete = DST_OBSOLETE_KILL; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } + neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev)) + if (IN_DEV_LOG_MARTIANS(in_dev)) { + const struct iphdr *iph = (const struct iphdr *) skb->data; + __be32 daddr = iph->daddr; + __be32 saddr = iph->saddr; + net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); + } #endif ; } -static bool peer_pmtu_expired(struct inet_peer *peer) +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); - - return orig && - time_after_eq(jiffies, orig) && - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; -} + struct rtable *rt; + struct flowi4 fl4; + const struct iphdr *iph = (const struct iphdr *) skb->data; + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; -static bool peer_pmtu_cleaned(struct inet_peer *peer) -{ - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + rt = (struct rtable *) dst; - return orig && - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; + __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __ip_do_redirect(rt, skb, &fl4, true); } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -1533,14 +805,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if (rt->rt_flags & RTCF_REDIRECTED) { - unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, - rt->rt_oif, - rt_genid(dev_net(dst->dev))); - rt_del(hash, rt); + } else if ((rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) { + ip_rt_put(rt); ret = NULL; - } else if (rt->peer && peer_pmtu_expired(rt->peer)) { - dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); } } return ret; @@ -1567,6 +835,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; + struct net *net; int log_martians; rcu_read_lock(); @@ -1578,11 +847,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + net = dev_net(rt->dst.dev); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); if (!peer) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } @@ -1597,7 +866,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->rate_tokens >= ip_rt_redirect_number) { peer->rate_last = jiffies; - return; + goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent @@ -1607,27 +876,47 @@ void ip_rt_send_redirect(struct sk_buff *skb) time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; 
++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", - &ip_hdr(skb)->saddr, rt->rt_iif, - &rt->rt_dst, &rt->rt_gateway); + &ip_hdr(skb)->saddr, inet_iif(skb), + &ip_hdr(skb)->daddr, &gw); #endif } +out_put_peer: + inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); struct rtable *rt = skb_rtable(skb); struct inet_peer *peer; unsigned long now; + struct net *net; bool send; int code; + net = dev_net(rt->dst.dev); + if (!IN_DEV_FORWARD(in_dev)) { + switch (rt->dst.error) { + case EHOSTUNREACH: + IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); + break; + + case ENETUNREACH: + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + break; + } + goto out; + } + switch (rt->dst.error) { case EINVAL: default: @@ -1637,17 +926,14 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->dst.dev), - IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); send = true; if (peer) { @@ -1660,6 +946,7 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; + inet_putpeer(peer); } if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1668,162 +955,183 @@ out: kfree_skb(skb); return 0; } -/* - * The last two values are not from the RFC but - * are needed for AMPRnet AX.25 paths. - */ +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ + struct dst_entry *dst = &rt->dst; + struct fib_result res; -static const unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + if (dst_metric_locked(dst, RTAX_MTU)) + return; -static inline unsigned short guess_mtu(unsigned short old_mtu) -{ - int i; + if (dst->dev->mtu < mtu) + return; - for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) - if (old_mtu > mtu_plateau[i]) - return mtu_plateau[i]; - return 68; -} + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; -unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, - unsigned short new_mtu, - struct net_device *dev) -{ - unsigned short old_mtu = ntohs(iph->tot_len); - unsigned short est_mtu = 0; - struct inet_peer *peer; + if (rt->rt_pmtu == mtu && + time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) + return; - peer = inet_getpeer_v4(iph->daddr, 1); - if (peer) { - unsigned short mtu = new_mtu; + rcu_read_lock(); + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); - if (new_mtu < 68 || new_mtu >= old_mtu) { - /* BSD 4.2 derived systems incorrectly adjust - * tot_len by the IP header length, and report - * a zero MTU in the ICMP message. 
- */ - if (mtu == 0 && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; - mtu = guess_mtu(old_mtu); - } + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + jiffies + ip_rt_mtu_expires); + } + rcu_read_unlock(); +} - if (mtu < ip_rt_min_pmtu) - mtu = ip_rt_min_pmtu; - if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { - unsigned long pmtu_expires; +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; - pmtu_expires = jiffies + ip_rt_mtu_expires; - if (!pmtu_expires) - pmtu_expires = 1UL; + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); +} - est_mtu = mtu; - peer->pmtu_learned = mtu; - peer->pmtu_expires = pmtu_expires; - atomic_inc(&__rt_peer_genid); - } +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, + int oif, u32 mark, u8 protocol, int flow_flags) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - inet_putpeer(peer); + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } - return est_mtu ? : new_mtu; } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (!expires) - return; - if (time_before(jiffies, expires)) { - u32 orig_dst_mtu = dst_mtu(dst); - if (peer->pmtu_learned < orig_dst_mtu) { - if (!peer->pmtu_orig) - peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); - dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); - } - } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) - dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); + } } -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + struct dst_entry *odst = NULL; + bool new = false; - dst_confirm(dst); + bh_lock_sock(sk); - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; - if (peer) { - unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); + if (!ip_sk_accept_pmtu(sk)) + goto out; - if (mtu < ip_rt_min_pmtu) - mtu = ip_rt_min_pmtu; - if (!pmtu_expires || mtu < peer->pmtu_learned) { + odst = sk_dst_get(sk); - pmtu_expires = jiffies + ip_rt_mtu_expires; - if (!pmtu_expires) - pmtu_expires = 1UL; + if (sock_owned_by_user(sk) || !odst) { + __ipv4_sk_update_pmtu(skb, sk, mtu); + goto out; + } - peer->pmtu_learned = mtu; - peer->pmtu_expires = pmtu_expires; + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - atomic_inc(&__rt_peer_genid); - rt->rt_peer_genid = rt_peer_genid(); - } - check_peer_pmtu(dst, peer); + rt = (struct rtable *)odst; + 
if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; } -} + __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); -static void ipv4_validate_peer(struct rtable *rt) -{ - if (rt->rt_peer_genid != rt_peer_genid()) { - struct inet_peer *peer; + if (!dst_check(&rt->dst, 0)) { + if (new) + dst_release(&rt->dst); - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 0); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; - peer = rt->peer; - if (peer) { - check_peer_pmtu(&rt->dst, peer); + new = true; + } - if (peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) - check_peer_redir(&rt->dst, peer); - } + if (new) + sk_dst_set(sk, &rt->dst); - rt->rt_peer_genid = rt_peer_genid(); - } +out: + bh_unlock_sock(sk); + dst_release(odst); } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags) { - struct rtable *rt = (struct rtable *) dst; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (rt_is_expired(rt)) - return NULL; - ipv4_validate_peer(rt); - return dst; + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); + } } +EXPORT_SYMBOL_GPL(ipv4_redirect); -static void ipv4_dst_destroy(struct dst_entry *dst) +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer = rt->peer; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (rt->fi) { - fib_info_put(rt->fi); - rt->fi = NULL; - } - if (peer) { - rt->peer = NULL; - inet_putpeer(peer); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rtable *rt = (struct rtable *) dst; + + /* All IPV4 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + * + * When a PMTU/redirect information update invalidates a route, + * this is indicated by setting obsolete to DST_OBSOLETE_KILL or + * DST_OBSOLETE_DEAD by dst_free(). 
+ */ + if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) + return NULL; + return dst; +} static void ipv4_link_failure(struct sk_buff *skb) { @@ -1832,11 +1140,11 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) - dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); + if (rt) + dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1880,8 +1188,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else - src = inet_select_addr(rt->dst.dev, rt->rt_gateway, - RT_SCOPE_UNIVERSE); + src = inet_select_addr(rt->dst.dev, + rt_nexthop(rt, iph->daddr), + RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); @@ -1913,95 +1222,204 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) static unsigned int ipv4_mtu(const struct dst_entry *dst) { const struct rtable *rt = (const struct rtable *) dst; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + unsigned int mtu = rt->rt_pmtu; - if (mtu && rt_is_output_route(rt)) + if (!mtu || time_after_eq(jiffies, rt->dst.expires)) + mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu) return mtu; mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - - if (rt->rt_gateway != rt->rt_dst && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } - if (mtu > IP_MAX_MTU) - mtu = IP_MAX_MTU; + return min_t(unsigned int, mtu, IP_MAX_MTU); +} + +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; - return mtu; + if (!hash) + return NULL; + + hval = fnhe_hashfun(daddr); + + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + return fnhe; + } + return NULL; } -static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, - struct fib_info *fi) +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, + __be32 daddr) { - struct inet_peer *peer; - int create = 0; + bool ret = false; - /* If a peer entry exists for this destination, we must hook - * it up in order to get at cached metrics. 
- */ - if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) - create = 1; + spin_lock_bh(&fnhe_lock); - rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); - if (peer) { - rt->rt_peer_genid = rt_peer_genid(); - if (inet_metrics_new(peer)) - memcpy(peer->metrics, fi->fib_metrics, - sizeof(u32) * RTAX_MAX); - dst_init_metrics(&rt->dst, peer->metrics, false); - - check_peer_pmtu(&rt->dst, peer); - - if (peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) { - rt->rt_gateway = peer->redirect_learned.a4; - rt->rt_flags |= RTCF_REDIRECTED; + if (daddr == fnhe->fnhe_daddr) { + struct rtable __rcu **porig; + struct rtable *orig; + int genid = fnhe_genid(dev_net(rt->dst.dev)); + + if (rt_is_input_route(rt)) + porig = &fnhe->fnhe_rth_input; + else + porig = &fnhe->fnhe_rth_output; + orig = rcu_dereference(*porig); + + if (fnhe->fnhe_genid != genid) { + fnhe->fnhe_genid = genid; + fnhe->fnhe_gw = 0; + fnhe->fnhe_pmtu = 0; + fnhe->fnhe_expires = 0; + fnhe_flush_routes(fnhe); + orig = NULL; + } + fill_route_from_fnhe(rt, fnhe); + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + + if (!(rt->dst.flags & DST_NOCACHE)) { + rcu_assign_pointer(*porig, rt); + if (orig) + rt_free(orig); + ret = true; } + + fnhe->fnhe_stamp = jiffies; + } + spin_unlock_bh(&fnhe_lock); + + return ret; +} + +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ + struct rtable *orig, *prev, **p; + bool ret = true; + + if (rt_is_input_route(rt)) { + p = (struct rtable **)&nh->nh_rth_input; } else { - if (fi->fib_metrics != (u32 *) dst_default_metrics) { - rt->fi = fi; - atomic_inc(&fi->fib_clntref); + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } + orig = *p; + + prev = cmpxchg(p, orig, rt); + if (prev == orig) { + if (orig) + rt_free(orig); + } else + ret = false; + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + + if (!list_empty(&rt->rt_uncached)) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + spin_unlock_bh(&rt_uncached_lock); } } -static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, +static bool rt_cache_valid(const struct rtable *rt) +{ + return rt && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + !rt_is_expired(rt); +} + +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, + struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag) { - struct dst_entry *dst = &rt->dst; + bool cached = false; if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); - rt_init_metrics(rt, fl4, fi); + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { + rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway 
= 1; + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID - dst->tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = nh->nh_tclassid; #endif - } - - if (dst_mtu(dst) > IP_MAX_MTU) - dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); - if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) - dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); + if (unlikely(fnhe)) + cached = rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_cache_route(nh, rt); + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } + } else + rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES - set_class_tag(rt, fib_rules_tclass(res)); + set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } static struct rtable *rt_dst_alloc(struct net_device *dev, - bool nopolicy, bool noxfrm) + bool nopolicy, bool noxfrm, bool will_cache) { - return dst_alloc(&ipv4_dst_ops, dev, 1, -1, - DST_HOST | + return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } @@ -2010,9 +1428,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned int hash; struct rtable *rth; - __be32 spec_dst; struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; int err; @@ -2023,21 +1439,24 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) + skb->protocol != htons(ETH_P_IP)) goto e_inval; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(saddr)) + goto e_inval; + if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) goto e_inval; - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) goto e_err; } rth = rt_dst_alloc(dev_net(dev)->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -2046,23 +1465,15 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->dst.output = ip_rt_bug; - rth->rt_key_dst = daddr; - rth->rt_key_src = saddr; - rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; - rth->rt_key_tos = tos; - rth->rt_dst = daddr; - rth->rt_src = saddr; - rth->rt_route_iif = dev->ifindex; - rth->rt_iif = dev->ifindex; - rth->rt_oif = 0; - rth->rt_mark = skb->mark; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_peer_genid = 0; - rth->peer = NULL; - rth->fi = NULL; + rth->rt_is_input= 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ 
-2074,9 +1485,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif RT_CACHE_STAT_INC(in_slow_mc); - hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - rth = rt_intern_hash(hash, rth, skb, dev->ifindex); - return IS_ERR(rth) ? PTR_ERR(rth) : 0; + skb_dst_set(skb, &rth->dst); + return 0; e_nobufs: return -ENOBUFS; @@ -2116,15 +1526,15 @@ static void ip_handle_martian_source(struct net_device *dev, static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct rtable **result) + __be32 daddr, __be32 saddr, u32 tos) { + struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; unsigned int flags = 0; - __be32 spec_dst; - u32 itag; + bool do_cache; + u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); @@ -2133,9 +1543,8 @@ static int __mkroute_input(struct sk_buff *skb, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2143,13 +1552,13 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - if (err) - flags |= RTCF_DIRECTSRC; - - if (out_dev == in_dev && err && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -2166,38 +1575,44 @@ static int __mkroute_input(struct sk_buff *skb, } } + fnhe = find_exception(&FIB_RES_NH(*res), daddr); + if (do_cache) { + if (fnhe != NULL) + rth = rcu_dereference(fnhe->fnhe_rth_input); + else + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; + } + } + rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM)); + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; } - rth->rt_key_dst = daddr; - rth->rt_key_src = saddr; - rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); + rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; - rth->rt_key_tos = tos; - rth->rt_dst = daddr; - rth->rt_src = saddr; - rth->rt_route_iif = in_dev->dev->ifindex; - rth->rt_iif = in_dev->dev->ifindex; - rth->rt_oif = 0; - rth->rt_mark = skb->mark; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_peer_genid = 0; - rth->peer = NULL; - rth->fi = NULL; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; - rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); - - *result = rth; + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + skb_dst_set(skb, &rth->dst); +out: err = 0; cleanup: return err; @@ -2209,27 +1624,13 @@ static int ip_mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { - struct rtable *rth = NULL; - int err; - unsigned int hash; - #ifdef 
CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) fib_select_multipath(res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); - if (err) - return err; - - /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl4->flowi4_iif, - rt_genid(dev_net(rth->dst.dev))); - rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); - if (IS_ERR(rth)) - return PTR_ERR(rth); - return 0; + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } /* @@ -2252,10 +1653,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned int flags = 0; u32 itag = 0; struct rtable *rth; - unsigned int hash; - __be32 spec_dst; int err = -EINVAL; struct net *net = dev_net(dev); + bool do_cache; /* IP on this device is disabled. */ @@ -2266,10 +1666,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; + res.fi = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -2279,9 +1679,20 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) goto martian_source; - if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) + if (ipv4_is_zeronet(daddr)) goto martian_destination; + /* Following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), + * and calls it once if daddr and/or saddr are loopback addresses + */ + if (ipv4_is_loopback(daddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_destination; + } else if (ipv4_is_loopback(saddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + } + /* * Now we are ready to route packet. 
*/ @@ -2295,29 +1706,25 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = fib_lookup(net, &fl4, &res); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + err = -EHOSTUNREACH; goto no_route; } - RT_CACHE_STAT_INC(in_slow_tot); - if (res.type == RTN_BROADCAST) goto brd_input; if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, - net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + 0, dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; - if (err) - flags |= RTCF_DIRECTSRC; - spec_dst = daddr; goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; + goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; @@ -2328,23 +1735,32 @@ brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ipv4_is_zeronet(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + if (!ipv4_is_zeronet(saddr)) { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) goto martian_source_keep_err; - if (err) - flags |= RTCF_DIRECTSRC; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: + do_cache = false; + if (res.fi) { + if (!itag) { + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; + } + do_cache = true; + } + } + rth = rt_dst_alloc(net->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -2354,41 +1770,33 @@ local_input: rth->dst.tclassid = itag; #endif - rth->rt_key_dst = daddr; - rth->rt_key_src = saddr; - rth->rt_genid = rt_genid(net); + rth->rt_genid = rt_genid_ipv4(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; - rth->rt_key_tos = tos; - rth->rt_dst = daddr; - rth->rt_src = saddr; -#ifdef CONFIG_IP_ROUTE_CLASSID - rth->dst.tclassid = itag; -#endif - rth->rt_route_iif = dev->ifindex; - rth->rt_iif = dev->ifindex; - rth->rt_oif = 0; - rth->rt_mark = skb->mark; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_peer_genid = 0; - rth->peer = NULL; - rth->fi = NULL; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); - rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + if (do_cache) { + if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + rth->dst.flags |= DST_NOCACHE; + rt_add_uncached_list(rth); + } + } + skb_dst_set(skb, &rth->dst); err = 0; - if (IS_ERR(rth)) - err = PTR_ERR(rth); goto out; no_route: RT_CACHE_STAT_INC(in_no_route); - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; @@ -2405,10 +1813,6 @@ martian_destination: &daddr, &saddr, dev->name); #endif -e_hostunreach: - err = -EHOSTUNREACH; - goto out; - e_inval: err = -EINVAL; goto out; @@ -2424,50 +1828,13 @@ martian_source_keep_err: goto out; } -int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref) +int 
ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { - struct rtable *rth; - unsigned int hash; - int iif = dev->ifindex; - struct net *net; int res; - net = dev_net(dev); - rcu_read_lock(); - if (!rt_caching(net)) - goto skip_cache; - - tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | - ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | - (rth->rt_route_iif ^ iif) | - (rth->rt_key_tos ^ tos)) == 0 && - rth->rt_mark == skb->mark && - net_eq(dev_net(rth->dst.dev), net) && - !rt_is_expired(rth)) { - ipv4_validate_peer(rth); - if (noref) { - dst_use_noref(&rth->dst, jiffies); - skb_dst_set_noref(skb, &rth->dst); - } else { - dst_use(&rth->dst, jiffies); - skb_dst_set(skb, &rth->dst); - } - RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); - return 0; - } - RT_CACHE_STAT_INC(in_hlist_search); - } - -skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2505,24 +1872,29 @@ skip_cache: rcu_read_unlock(); return res; } -EXPORT_SYMBOL(ip_route_input_common); +EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, - const struct flowi4 *fl4, - __be32 orig_daddr, __be32 orig_saddr, - int orig_oif, __u8 orig_rtos, + const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; + struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; + bool do_cache; - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) return ERR_PTR(-EINVAL); + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); + if (ipv4_is_lbcast(fl4->daddr)) type = RTN_BROADCAST; else if (ipv4_is_multicast(fl4->daddr)) @@ -2533,10 +1905,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - in_dev = __in_dev_get_rcu(dev_out); - if (!in_dev) - return ERR_PTR(-EINVAL); - + do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; fi = NULL; @@ -2545,6 +1914,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; + else + do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. 
@@ -2553,40 +1924,57 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fi = NULL; } + fnhe = NULL; + do_cache &= fi != NULL; + if (do_cache) { + struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); + + fnhe = find_exception(nh, fl4->daddr); + if (fnhe) + prth = &fnhe->fnhe_rth_output; + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } + rth = rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; + } + } + +add: rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM)); + IN_DEV_CONF_GET(in_dev, NOXFRM), + do_cache); if (!rth) return ERR_PTR(-ENOBUFS); rth->dst.output = ip_output; - rth->rt_key_dst = orig_daddr; - rth->rt_key_src = orig_saddr; - rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; - rth->rt_key_tos = orig_rtos; - rth->rt_dst = fl4->daddr; - rth->rt_src = fl4->saddr; - rth->rt_route_iif = 0; - rth->rt_iif = orig_oif ? : dev_out->ifindex; - rth->rt_oif = orig_oif; - rth->rt_mark = fl4->flowi4_mark; - rth->rt_gateway = fl4->daddr; - rth->rt_spec_dst= fl4->saddr; - rth->rt_peer_genid = 0; - rth->peer = NULL; - rth->fi = NULL; + rth->rt_is_input = 0; + rth->rt_iif = orig_oif ? : 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) { + if (flags & RTCF_LOCAL) rth->dst.input = ip_local_deliver; - rth->rt_spec_dst = fl4->daddr; - } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; @@ -2603,37 +1991,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res, #endif } - rt_set_nexthop(rth, fl4, res, fi, type, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); return rth; } /* * Major route resolver routine. - * called with rcu_read_lock(); */ -static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); unsigned int flags = 0; struct fib_result res; struct rtable *rth; - __be32 orig_daddr; - __be32 orig_saddr; int orig_oif; + res.tclassid = 0; res.fi = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif + res.table = NULL; - orig_daddr = fl4->daddr; - orig_saddr = fl4->saddr; orig_oif = fl4->flowi4_oif; - fl4->flowi4_iif = net->loopback_dev->ifindex; + fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); @@ -2707,7 +2089,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) RT_SCOPE_LINK); goto make_route; } - if (fl4->saddr) { + if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); @@ -2722,7 +2104,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - fl4->flowi4_oif = net->loopback_dev->ifindex; + fl4->flowi4_oif = LOOPBACK_IFINDEX; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2730,6 +2112,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) if (fib_lookup(net, fl4, &res)) { res.fi = NULL; + res.table = NULL; if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2768,7 +2151,6 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; - res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } @@ -2791,60 +2173,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) make_route: - rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, - tos, dev_out, flags); - if (!IS_ERR(rth)) { - unsigned int hash; - - hash = rt_hash(orig_daddr, orig_saddr, orig_oif, - rt_genid(dev_net(dev_out))); - rth = rt_intern_hash(hash, rth, NULL, orig_oif); - } + rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); out: rcu_read_unlock(); return rth; } - -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) -{ - struct rtable *rth; - unsigned int hash; - - if (!rt_caching(net)) - goto slow_output; - - hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); - - rcu_read_lock_bh(); - for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; - rth = rcu_dereference_bh(rth->dst.rt_next)) { - if (rth->rt_key_dst == flp4->daddr && - rth->rt_key_src == flp4->saddr && - rt_is_output_route(rth) && - rth->rt_oif == flp4->flowi4_oif && - rth->rt_mark == flp4->flowi4_mark && - !((rth->rt_key_tos ^ flp4->flowi4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->dst.dev), net) && - !rt_is_expired(rth)) { - ipv4_validate_peer(rth); - dst_use(&rth->dst, jiffies); - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - if (!flp4->saddr) - flp4->saddr = rth->rt_src; - if (!flp4->daddr) - flp4->daddr = rth->rt_dst; - return rth; - } - RT_CACHE_STAT_INC(out_hlist_search); - } - rcu_read_unlock_bh(); - -slow_output: - return ip_route_output_slow(net, flp4); -} EXPORT_SYMBOL_GPL(__ip_route_output_key); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) @@ -2859,7 +2193,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) return mtu ? 
: dst->dev->mtu; } -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } @@ -2872,53 +2212,43 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .redirect = ipv4_rt_blackhole_redirect, .cow_metrics = ipv4_rt_blackhole_cow_metrics, .neigh_lookup = ipv4_neigh_lookup, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *rt; + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; - new->output = dst_discard; - dst_copy_metrics(new, &ort->dst); + new->output = dst_discard_sk; new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); - rt->rt_key_dst = ort->rt_key_dst; - rt->rt_key_src = ort->rt_key_src; - rt->rt_key_tos = ort->rt_key_tos; - rt->rt_route_iif = ort->rt_route_iif; + rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; - rt->rt_oif = ort->rt_oif; - rt->rt_mark = ort->rt_mark; + rt->rt_pmtu = ort->rt_pmtu; - rt->rt_genid = rt_genid(net); + rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_dst = ort->rt_dst; - rt->rt_src = ort->rt_src; rt->rt_gateway = ort->rt_gateway; - rt->rt_spec_dst = ort->rt_spec_dst; - rt->peer = ort->peer; - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - rt->fi = ort->fi; - if (rt->fi) - atomic_inc(&rt->fi->fib_clntref); + rt->rt_uses_gateway = ort->rt_uses_gateway; + + INIT_LIST_HEAD(&rt->rt_uncached); dst_free(new); } @@ -2945,18 +2275,18 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, - struct sk_buff *skb, u32 pid, u32 seq, int event, - int nowait, unsigned int flags) +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, + struct flowi4 *fl4, struct sk_buff *skb, u32 portid, + u32 seq, int event, int nowait, unsigned int flags) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; - const struct inet_peer *peer = rt->peer; - u32 id = 0, ts = 0, tsage = 0, error; + u32 error; + u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2964,7 +2294,7 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->rt_key_tos; + r->rtm_tos = fl4->flowi4_tos; r->rtm_table = RT_TABLE_MAIN; if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) goto nla_put_failure; @@ -2975,11 +2305,11 @@ static int rt_fill_info(struct net *net, if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; - if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) + if (nla_put_be32(skb, RTA_DST, dst)) goto nla_put_failure; - if (rt->rt_key_src) { + if (src) { 
r->rtm_src_len = 32; - if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) + if (nla_put_be32(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && @@ -2990,49 +2320,43 @@ static int rt_fill_info(struct net *net, nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif - if (rt_is_input_route(rt)) { - if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) - goto nla_put_failure; - } else if (rt->rt_src != rt->rt_key_src) { - if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) + if (!rt_is_input_route(rt) && + fl4->saddr != src) { + if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_dst != rt->rt_gateway && + if (rt->rt_uses_gateway && nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; + + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + } + + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt_pmtu && expires) + metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (rt->rt_mark && - nla_put_be32(skb, RTA_MARK, rt->rt_mark)) + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; error = rt->dst.error; - if (peer) { - inet_peer_refcheck(rt->peer); - id = atomic_read(&peer->ip_id_count) & 0xffff; - if (peer->tcp_ts_stamp) { - ts = peer->tcp_ts; - tsage = get_seconds() - peer->tcp_ts_stamp; - } - expires = ACCESS_ONCE(peer->pmtu_expires); - if (expires) { - if (time_before(jiffies, expires)) - expires -= jiffies; - else - expires = 0; - } - } if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE - __be32 dst = rt->rt_dst; - if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, - rt->rt_src, rt->rt_dst, + fl4->saddr, fl4->daddr, r, nowait); if (err <= 0) { if (!nowait) { @@ -3047,12 +2371,11 @@ static int rt_fill_info(struct net *net, } } else #endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, - expires, error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -3062,12 +2385,13 @@ nla_put_failure: return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; + struct flowi4 fl4; __be32 dst = 0; __be32 src = 0; u32 iif; @@ -3102,6 +2426,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst; + fl4.saddr = src; + fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_mark = mark; + if (iif) { struct net_device *dev; @@ -3122,13 +2453,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - struct flowi4 fl4 = { - .daddr = dst, - .saddr = src, - .flowi4_tos = rtm->rtm_tos, - .flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0, - .flowi4_mark = mark, - }; rt = ip_route_output_key(net, &fl4); err = 0; @@ -3143,12 +2467,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + err = rt_fill_info(net, dst, src, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; @@ -3157,76 +2482,33 @@ errout_free: goto errout; } -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct rtable *rt; - int h, s_h; - int idx, s_idx; - struct net *net; - - net = sock_net(skb->sk); - - s_h = cb->args[0]; - if (s_h < 0) - s_h = 0; - s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { - if (!rt_hash_table[h].chain) - continue; - rcu_read_lock_bh(); - for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) - continue; - if (rt_is_expired(rt)) - continue; - skb_dst_set_noref(skb, &rt->dst); - if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) <= 0) { - skb_dst_drop(skb); - rcu_read_unlock_bh(); - goto done; - } - skb_dst_drop(skb); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; - return skb->len; -} - void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(dev_net(in_dev->dev), 0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL -static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_gc_elasticity __read_mostly = 8; + +static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write) { - int flush_delay; - ctl_table ctl; - struct net *net; + struct net *net = (struct net *)__ctl->extra1; - memcpy(&ctl, __ctl, sizeof(ctl)); - ctl.data = &flush_delay; - proc_dointvec(&ctl, write, buffer, lenp, ppos); - - net = (struct net *)__ctl->extra1; - rt_cache_flush(net, flush_delay); + if (write) { + rt_cache_flush(net); + fnhe_genid_bump(net); return 0; } return -EINVAL; } -static ctl_table ipv4_route_table[] = { +static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, @@ -3356,6 +2638,10 @@ static __net_init int sysctl_route_net_init(struct net *net) tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; } tbl[0].extra1 = net; @@ -3389,8 +2675,8 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - get_random_bytes(&net->ipv4.rt_genid, - sizeof(net->ipv4.rt_genid)); + atomic_set(&net->ipv4.rt_genid, 0); + atomic_set(&net->fnhe_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; @@ -3400,31 +2686,45 @@ static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; +static 
int __net_init ipv4_inetpeer_init(struct net *net) +{ + struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); -#ifdef CONFIG_IP_ROUTE_CLASSID -struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_IP_ROUTE_CLASSID */ + if (!bp) + return -ENOMEM; + inet_peer_base_init(bp); + net->ipv4.peers = bp; + return 0; +} -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) +static void __net_exit ipv4_inetpeer_exit(struct net *net) { - ssize_t ret; + struct inet_peer_base *bp = net->ipv4.peers; - if (!str) - return 0; + net->ipv4.peers = NULL; + inetpeer_invalidate_tree(bp); + kfree(bp); +} - ret = kstrtoul(str, 0, &rhash_entries); - if (ret) - return 0; +static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { + .init = ipv4_inetpeer_init, + .exit = ipv4_inetpeer_exit, +}; - return 1; -} -__setup("rhash_entries=", set_rhash_entries); +#ifdef CONFIG_IP_ROUTE_CLASSID +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; +#endif /* CONFIG_IP_ROUTE_CLASSID */ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) @@ -3443,36 +2743,17 @@ int __init ip_rt_init(void) if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); - rt_hash_table = (struct rt_hash_bucket *) - alloc_large_system_hash("IP route cache", - sizeof(struct rt_hash_bucket), - rhash_entries, - (totalram_pages >= 128 * 1024) ? - 15 : 17, - 0, - &rt_hash_log, - &rt_hash_mask, - 0, - rhash_entries ? 
0 : 512 * 1024); - memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); - rt_hash_lock_init(); - - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + ipv4_dst_ops.gc_thresh = ~0; + ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); - xfrm4_init(ip_rt_max_size); + xfrm4_init(); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); @@ -3480,6 +2761,7 @@ int __init ip_rt_init(void) register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&rt_genid_ops); + register_pernet_subsys(&ipv4_inetpeer_ops); return rc; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eab2a7fb15d..c86624b36a6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -25,15 +25,7 @@ extern int sysctl_tcp_syncookies; -__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; -EXPORT_SYMBOL(syncookie_secret); - -static __init int init_syncookies(void) -{ - get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); - return 0; -} -__initcall(init_syncookies); +static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -44,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); + __u32 *tmp; + + net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); + tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; tmp[1] = (__force u32)daddr; @@ -89,8 +84,7 @@ __u32 cookie_init_timestamp(struct request_sock *req) static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, - __be16 dport, __u32 sseq, __u32 count, - __u32 data) + __be16 dport, __u32 sseq, __u32 data) { /* * Compute the secure sequence number. @@ -102,7 +96,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ - + u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) @@ -114,22 +108,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * - * The count value used to generate the cookie must be within - * "maxdiff" if the current (passed-in) "count". The return value - * is (__u32)-1 if this test fails. + * The count value used to generate the cookie must be less than + * MAX_SYNCOOKIE_AGE minutes in the past. + * The return value is (__u32)-1 if this test fails. 
*/ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, - __be16 sport, __be16 dport, __u32 sseq, - __u32 count, __u32 maxdiff) + __be16 sport, __be16 dport, __u32 sseq) { - __u32 diff; + u32 diff, count = tcp_cookie_time(); /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ - diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); - if (diff >= maxdiff) + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); + if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - @@ -138,72 +131,70 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, } /* - * MSS Values are taken from the 2009 paper - * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: - * - values 1440 to 1460 accounted for 80% of observed mss values - * - values outside the 536-1460 range are rare (<0.2%). + * MSS Values are chosen based on the 2011 paper + * 'An Analysis of TCP Maximum Segment Sizes' by S. Alcock and R. Nelson. + * Values .. + * .. lower than 536 are rare (< 0.2%) + * .. between 537 and 1299 account for less than 1.5% of observed values + * .. in the 1300-1349 range account for about 15 to 20% of observed mss values + * .. exceeding 1460 are very rare (< 0.04%) * - * Table must be sorted. + * 1460 is the single most frequently announced mss value (30 to 46% depending + * on monitor location). Table must be sorted. */ static __u16 const msstab[] = { - 64, - 512, 536, - 1024, - 1440, + 1300, + 1440, /* 1440, 1452: PPPoE */ 1460, - 4312, - 8960, }; /* * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, + u16 *mssp) { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), - jiffies / (HZ * 60), mssind); + mssind); +} +EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); + +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v4_init_sequence(iph, th, mssp); } -/* - * This (misnamed) value is the age of syncookie which is permitted. - * Its ideal value should be dependent on TCP_TIMEOUT_INIT and - * sysctl_tcp_retries1. It's a rather complicated formula (exponential - * backoff) to compute at runtime so it's currently hardcoded here. - */ -#define COUNTER_TRIES 4 /* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. 
*/ -static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, + u32 cookie) { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, - th->source, th->dest, seq, - jiffies / (HZ * 60), - COUNTER_TRIES); + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v4_check); static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -232,7 +223,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * * return false if we decode an option that should not be. */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) +bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, + struct net *net, bool *ecn_ok) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr & TSMASK; @@ -247,7 +239,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; *ecn_ok = (options >> 5) & 1; - if (*ecn_ok && !sysctl_tcp_ecn) + if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) return false; if (tcp_opt->sack_ok && !sysctl_tcp_sack) @@ -266,7 +258,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { struct tcp_options_received tcp_opt; - const u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -284,7 +275,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } @@ -293,9 +284,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; ret = NULL; @@ -308,10 +299,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; - ireq->loc_port = th->dest; - ireq->rmt_port = th->source; - ireq->loc_addr = ip_hdr(skb)->daddr; - ireq->rmt_addr = ip_hdr(skb)->saddr; + ireq->ir_num = ntohs(th->dest); + ireq->ir_rmt_port = th->source; + ireq->ir_loc_addr = ip_hdr(skb)->daddr; + ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->ir_mark = inet_request_mark(sk, skb); ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -319,6 +311,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsecr : 0; + treq->listener = NULL; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) @@ -339,7 +332,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } req->expires = 0UL; - req->retrans = 0; + req->num_retrans = 0; /* * We need to lookup the route here to get at the correct @@ -347,11 +340,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), - RT_SCOPE_UNIVERSE, IPPROTO_TCP, + flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, - ireq->loc_addr, th->source, th->dest); + (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef32956ed65..79a007c5255 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -27,7 +27,9 @@ #include <net/tcp_memcontrol.h> static int zero; -static int two = 2; +static int one = 1; +static int four = 4; +static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -35,26 +37,30 @@ static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int ip_ttl_min = 1; static int ip_ttl_max = 255; +static int tcp_syn_retries_min = 1; +static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ -static void set_local_port_range(int range[2]) +static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&sysctl_local_ports.lock); - sysctl_local_ports.range[0] = range[0]; - sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&sysctl_local_ports.lock); + write_seqlock(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = range[0]; + net->ipv4.ip_local_ports.range[1] = range[1]; + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. 
*/ -static int ipv4_local_port_range(ctl_table *table, int write, +static int ipv4_local_port_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = + container_of(table->data, struct net, ipv4.ip_local_ports.range); int ret; int range[2]; - ctl_table tmp = { + struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, @@ -62,71 +68,88 @@ static int ipv4_local_port_range(ctl_table *table, int write, .extra2 = &ip_local_port_range_max, }; - inet_get_local_port_range(range, range + 1); + inet_get_local_port_range(net, &range[0], &range[1]); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) ret = -EINVAL; else - set_local_port_range(range); + set_local_port_range(net, range); } return ret; } -static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) +static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) { - gid_t *data = table->data; + kgid_t *data = table->data; + struct net *net = + container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } /* Update system visible IP port range */ -static void set_ping_group_range(struct ctl_table *table, gid_t range[2]) +static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) { - gid_t *data = table->data; - write_seqlock(&sysctl_local_ports.lock); - data[0] = range[0]; - data[1] = range[1]; - write_sequnlock(&sysctl_local_ports.lock); + kgid_t *data = table->data; + struct net *net = + container_of(table->data, struct net, ipv4.ping_group_range.range); + write_seqlock(&net->ipv4.ip_local_ports.lock); + data[0] = low; + data[1] = high; + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. 
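Both handlers above now recover their struct net purely from the ctl_table data pointer via container_of(), i.e. by subtracting the member's offset inside struct net; no separate back-pointer has to be stored in the table. A self-contained userspace illustration of that trick (struct net_toy and its fields are made-up stand-ins for struct net):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct net_toy {
	int other_state;
	int local_port_range[2];	/* plays ipv4.ip_local_ports.range */
};

/* a sysctl-style handler that is only handed the data pointer */
static void handler(void *table_data)
{
	struct net_toy *net = container_of(table_data, struct net_toy,
					   local_port_range);

	printf("range %d-%d, owner state %d\n",
	       net->local_port_range[0], net->local_port_range[1],
	       net->other_state);
}

int main(void)
{
	struct net_toy net = { .other_state = 7,
			       .local_port_range = { 32768, 60999 } };

	handler(net.local_port_range);	/* prints the owning struct's fields */
	return 0;
}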
*/ -static int ipv4_ping_group_range(ctl_table *table, int write, +static int ipv4_ping_group_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct user_namespace *user_ns = current_user_ns(); int ret; - gid_t range[2]; - ctl_table tmp = { - .data = &range, - .maxlen = sizeof(range), + gid_t urange[2]; + kgid_t low, high; + struct ctl_table tmp = { + .data = &urange, + .maxlen = sizeof(urange), .mode = table->mode, .extra1 = &ip_ping_group_range_min, .extra2 = &ip_ping_group_range_max, }; - inet_get_ping_group_range_table(table, range, range + 1); + inet_get_ping_group_range_table(table, &low, &high); + urange[0] = from_kgid_munged(user_ns, low); + urange[1] = from_kgid_munged(user_ns, high); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && ret == 0) - set_ping_group_range(table, range); + if (write && ret == 0) { + low = make_kgid(user_ns, urange[0]); + high = make_kgid(user_ns, urange[1]); + if (!gid_valid(low) || !gid_valid(high) || + (urange[1] < urange[0]) || gid_lt(high, low)) { + low = make_kgid(&init_user_ns, 1); + high = make_kgid(&init_user_ns, 0); + } + set_ping_group_range(table, low, high); + } return ret; } -static int proc_tcp_congestion_control(ctl_table *ctl, int write, +static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char val[TCP_CA_NAME_MAX]; - ctl_table tbl = { + struct ctl_table tbl = { .data = val, .maxlen = TCP_CA_NAME_MAX, }; @@ -140,12 +163,12 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, return ret; } -static int proc_tcp_available_congestion_control(ctl_table *ctl, +static int proc_tcp_available_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; + struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); @@ -157,12 +180,12 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl, return ret; } -static int proc_allowed_congestion_control(ctl_table *ctl, +static int proc_allowed_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; + struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); @@ -177,47 +200,51 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } -static int ipv4_tcp_mem(ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { + struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; + struct tcp_fastopen_context *ctxt; int ret; - unsigned long vec[3]; - struct net *net = current->nsproxy->net_ns; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM - struct mem_cgroup *memcg; -#endif + u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ - ctl_table tmp = { - .data = &vec, - .maxlen = sizeof(vec), - .mode = ctl->mode, - }; - - if (!write) { - ctl->data = &net->ipv4.sysctl_tcp_mem; - return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); - } - - ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); - if (ret) - return ret; + tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); + if (!tbl.data) + return -ENOMEM; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM rcu_read_lock(); - memcg = 
mem_cgroup_from_task(current); - - tcp_prot_mem(memcg, vec[0], 0); - tcp_prot_mem(memcg, vec[1], 1); - tcp_prot_mem(memcg, vec[2], 2); + ctxt = rcu_dereference(tcp_fastopen_ctx); + if (ctxt) + memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + else + memset(user_key, 0, sizeof(user_key)); rcu_read_unlock(); -#endif - net->ipv4.sysctl_tcp_mem[0] = vec[0]; - net->ipv4.sysctl_tcp_mem[1] = vec[1]; - net->ipv4.sysctl_tcp_mem[2] = vec[2]; + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", + user_key[0], user_key[1], user_key[2], user_key[3]); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - return 0; + if (write && ret == 0) { + if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, + user_key + 2, user_key + 3) != 4) { + ret = -EINVAL; + goto bad_key; + } + /* Generate a dummy secret but don't publish it. This + * is needed so we don't regenerate a new key on the + * first invocation of tcp_fastopen_cookie_gen + */ + tcp_fastopen_init_key_once(false); + tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + } + +bad_key: + pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", + user_key[0], user_key[1], user_key[2], user_key[3], + (char *)tbl.data, ret); + kfree(tbl.data); + return ret; } static struct ctl_table ipv4_table[] = { @@ -259,13 +286,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &ip_ttl_max, }, { - .procname = "ip_no_pmtu_disc", - .data = &ipv4_config.no_pmtu_disc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -277,7 +297,9 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_syn_retries, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_syn_retries_min, + .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", @@ -301,6 +323,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_early_demux", + .data = &sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, .maxlen = sizeof(int), @@ -360,6 +389,19 @@ static struct ctl_table ipv4_table[] = { }, #endif { + .procname = "tcp_fastopen", + .data = &sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, + { .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), @@ -395,20 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_local_port_range", - .data = &sysctl_local_ports.range, - .maxlen = sizeof(sysctl_local_ports.range), - .mode = 0644, - .proc_handler = ipv4_local_port_range, - }, - { - .procname = "ip_local_reserved_ports", - .data = NULL, /* initialized in sysctl_ipv4_init */ - .maxlen = 65536, - .mode = 0644, - .proc_handler = proc_do_large_bitmap, - }, - { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, .maxlen = sizeof(int), @@ -465,32 +493,41 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_ecn", - .data = &sysctl_tcp_ecn, + .procname = "tcp_dsack", + .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { - .procname 
= "tcp_dsack", - .data = &sysctl_tcp_dsack, - .maxlen = sizeof(int), + .procname = "tcp_mem", + .maxlen = sizeof(sysctl_tcp_mem), + .data = &sysctl_tcp_mem, .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_wmem", .data = &sysctl_tcp_wmem, .maxlen = sizeof(sysctl_tcp_wmem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_notsent_lowat", + .data = &sysctl_tcp_notsent_lowat, + .maxlen = sizeof(sysctl_tcp_notsent_lowat), + .mode = 0644, + .proc_handler = proc_dointvec, }, { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "tcp_app_win", @@ -523,13 +560,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_frto_response", - .data = &sysctl_tcp_frto_response, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), @@ -564,13 +594,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_congestion_control, }, { - .procname = "tcp_abc", - .data = &sysctl_tcp_abc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_mtu_probing", .data = &sysctl_tcp_mtu_probing, .maxlen = sizeof(int), @@ -591,6 +614,20 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_challenge_ack_limit", + .data = &sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", @@ -650,27 +687,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_allowed_congestion_control, }, { - .procname = "tcp_max_ssthresh", - .data = &sysctl_tcp_max_ssthresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_cookie_size", - .data = &sysctl_tcp_cookie_size, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_thin_linear_timeouts", .data = &sysctl_tcp_thin_linear_timeouts, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { + { .procname = "tcp_thin_dupack", .data = &sysctl_tcp_thin_dupack, .maxlen = sizeof(int), @@ -684,7 +707,25 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, - .extra2 = &two, + .extra2 = &four, + }, + { + .procname = "tcp_min_tso_segs", + .data = &sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &gso_max_segs, + }, + { + .procname = "tcp_autocorking", + .data = &sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, { .procname = "udp_mem", @@ -699,7 +740,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = &one }, { .procname = "udp_wmem_min", @@ -707,7 +748,7 @@ static struct ctl_table ipv4_table[] = { 
.maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = &one }, { } }; @@ -756,24 +797,60 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .procname = "rt_cache_rebuild_count", - .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, + .procname = "ping_group_range", + .data = &init_net.ipv4.ping_group_range.range, + .maxlen = sizeof(gid_t)*2, + .mode = 0644, + .proc_handler = ipv4_ping_group_range, + }, + { + .procname = "tcp_ecn", + .data = &init_net.ipv4.sysctl_tcp_ecn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_local_port_range", + .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), + .data = &init_net.ipv4.ip_local_ports.range, + .mode = 0644, + .proc_handler = ipv4_local_port_range, + }, + { + .procname = "ip_local_reserved_ports", + .data = &init_net.ipv4.sysctl_local_reserved_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, + { + .procname = "ip_no_pmtu_disc", + .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { - .procname = "ping_group_range", - .data = &init_net.ipv4.sysctl_ping_group_range, - .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), + .procname = "ip_forward_use_pmtu", + .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = ipv4_ping_group_range, + .proc_handler = proc_dointvec, }, { - .procname = "tcp_mem", - .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), + .procname = "fwmark_reflect", + .data = &init_net.ipv4.sysctl_fwmark_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fwmark_accept", + .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = ipv4_tcp_mem, + .proc_handler = proc_dointvec, }, { } }; @@ -784,46 +861,29 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table = ipv4_net_table; if (!net_eq(net, &init_net)) { + int i; + table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (table == NULL) goto err_alloc; - table[0].data = - &net->ipv4.sysctl_icmp_echo_ignore_all; - table[1].data = - &net->ipv4.sysctl_icmp_echo_ignore_broadcasts; - table[2].data = - &net->ipv4.sysctl_icmp_ignore_bogus_error_responses; - table[3].data = - &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; - table[4].data = - &net->ipv4.sysctl_icmp_ratelimit; - table[5].data = - &net->ipv4.sysctl_icmp_ratemask; - table[6].data = - &net->ipv4.sysctl_rt_cache_rebuild_count; - table[7].data = - &net->ipv4.sysctl_ping_group_range; - + /* Update the variables to point into the current struct net */ + for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; } - /* - * Sane defaults - nobody may create ping sockets. - * Boot scripts should set this to distro-specific group. 
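The per-namespace duplication above no longer patches each table slot by hand; instead every .data pointer in the copied table is shifted by the byte distance between the new struct net and init_net, so a template that points into init_net is rebased wholesale. A small userspace sketch of the same pointer-rebasing idiom (cfg and entry are illustrative names, and like the kernel code it relies on flat address arithmetic between the two instances):

#include <stdio.h>
#include <string.h>

struct cfg { int ttl; int forward; };

struct entry { const char *name; void *data; };

int main(void)
{
	static struct cfg init_cfg = { .ttl = 64, .forward = 0 };
	struct cfg other_cfg = { .ttl = 255, .forward = 1 };

	/* template table: every data pointer targets init_cfg */
	struct entry template[] = {
		{ "ttl",     &init_cfg.ttl },
		{ "forward", &init_cfg.forward },
	};
	struct entry dup[2];

	memcpy(dup, template, sizeof(dup));
	for (unsigned int i = 0; i < 2; i++)	/* rebase onto other_cfg */
		dup[i].data = (char *)dup[i].data +
			      ((char *)&other_cfg - (char *)&init_cfg);

	printf("%s=%d %s=%d\n",
	       dup[0].name, *(int *)dup[0].data,
	       dup[1].name, *(int *)dup[1].data);	/* ttl=255 forward=1 */
	return 0;
}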
- */ - net->ipv4.sysctl_ping_group_range[0] = 1; - net->ipv4.sysctl_ping_group_range[1] = 0; - - net->ipv4.sysctl_rt_cache_rebuild_count = 4; - - tcp_init_mem(net); - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; + net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_reserved_ports) + goto err_ports; + return 0; +err_ports: + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); @@ -835,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); @@ -848,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; - struct ctl_table *i; - - for (i = ipv4_table; i->procname; i++) { - if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { - i->data = sysctl_local_reserved_ports; - break; - } - } - if (!i->procname) - return -EINVAL; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ba605f60e4..9d2118e5fbc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include <linux/slab.h> #include <net/icmp.h> +#include <net/inet_common.h> #include <net/tcp.h> #include <net/xfrm.h> #include <net/ip.h> @@ -278,15 +279,22 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <net/busy_poll.h> int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +int sysctl_tcp_min_tso_segs __read_mostly = 2; + +int sysctl_tcp_autocorking __read_mostly = 1; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; +EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); @@ -373,12 +381,13 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); + __skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -398,6 +407,8 @@ void tcp_init_sock(struct sock *sk) tcp_enable_early_retrans(tp); icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tp->tsoffset = 0; + sk->sk_state = TCP_CLOSE; sk->sk_write_space = sk_stream_write_space; @@ -405,19 +416,6 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - /* TCP Cookie Transactions */ - if (sysctl_tcp_cookie_size > 0) { - /* Default, cookies without s_data_payload. 
*/ - tp->cookie_values = - kzalloc(sizeof(*tp->cookie_values), - sk->sk_allocation); - if (tp->cookie_values != NULL) - kref_init(&tp->cookie_values->kref); - } - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -441,6 +439,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + sock_rps_record_flow(sk); + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_LISTEN) return inet_csk_listen_poll(sk); @@ -484,8 +484,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; - /* Connected? */ - if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Connected or passive Fast Open socket? */ + if (sk->sk_state != TCP_SYN_SENT && + (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -500,7 +501,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ set_bit(SOCK_ASYNC_NOSPACE, @@ -511,7 +512,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * wspace test but before the flags are set, * IO signal will be lost. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } } else @@ -533,30 +534,29 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct tcp_sock *tp = tcp_sk(sk); int answ; + bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; - lock_sock(sk); + slow = lock_sock_fast(sk); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { - struct sk_buff *skb; answ = tp->rcv_nxt - tp->copied_seq; - /* Subtract 1, if FIN is in queue. */ - skb = skb_peek_tail(&sk->sk_receive_queue); - if (answ && skb) - answ -= tcp_hdr(skb)->fin; + /* Subtract 1, if FIN was received */ + if (answ && sock_flag(sk, SOCK_DONE)) + answ--; } else answ = tp->urg_seq - tp->copied_seq; - release_sock(sk); + unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = tp->urg_data && tp->urg_seq == tp->copied_seq; @@ -621,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) tp->snd_up = tp->write_seq; } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have data packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. + * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. 
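The helper introduced next, tcp_should_autocork(), turns that comment into four checks: the skb is not yet full, autocorking is enabled, the skb is not the only packet the socket would have in flight, and earlier payload is still sitting in Qdisc/NIC queues. Restated as a standalone predicate (struct and field names below are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct toy_sk {
	unsigned int wmem_alloc;	/* bytes still owned by the socket below TCP */
	bool skb_is_queue_head;		/* would this skb be the head of the write queue? */
};

static bool should_autocork(const struct toy_sk *sk, unsigned int skb_len,
			    unsigned int skb_truesize, unsigned int size_goal,
			    bool autocork_enabled)
{
	return skb_len < size_goal &&		/* not full: coalescing still helps   */
	       autocork_enabled &&		/* sysctl tcp_autocorking             */
	       !sk->skb_is_queue_head &&	/* never hold back the queue head     */
	       sk->wmem_alloc > skb_truesize;	/* older payload still in Qdisc/NIC   */
}

int main(void)
{
	struct toy_sk busy = { .wmem_alloc = 64 * 1024, .skb_is_queue_head = false };
	struct toy_sk idle = { .wmem_alloc = 512,       .skb_is_queue_head = true  };

	printf("busy socket, small write: %d\n",
	       should_autocork(&busy, 512, 2048, 65536, true));	/* 1: defer */
	printf("idle socket, small write: %d\n",
	       should_autocork(&idle, 512, 2048, 65536, true));	/* 0: send  */
	return 0;
}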
+ */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, + int size_goal) { - if (tcp_send_head(sk)) { - struct tcp_sock *tp = tcp_sk(sk); + return skb->len < size_goal && + sysctl_tcp_autocorking && + skb != tcp_write_queue_head(sk) && + atomic_read(&sk->sk_wmem_alloc) > skb->truesize; +} + +static void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!tcp_send_head(sk)) + return; + + skb = tcp_write_queue_tail(sk); + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); + + tcp_mark_urg(tp, flags); - if (!(flags & MSG_MORE) || forced_push(tp)) - tcp_mark_push(tp, tcp_write_queue_tail(sk)); + if (tcp_should_autocork(sk, skb, size_goal)) { - tcp_mark_urg(tp, flags); - __tcp_push_pending_frames(sk, mss_now, - (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + /* avoid atomic op if TSQ_THROTTLED bit is already set */ + if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + } + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. + */ + if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) + return; } + + if (flags & MSG_MORE) + nonagle = TCP_NAGLE_CORK; + + __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -771,7 +810,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb->avail_size = size; + skb->reserved_tailroom = skb->end - skb->tail - size; return skb; } __kfree_skb(skb); @@ -791,10 +830,24 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, xmit_size_goal = mss_now; if (large_allowed && sk_can_gso(sk)) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); + u32 gso_size, hlen; + + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. 
+ */ + gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); + gso_size = max_t(u32, gso_size, + sysctl_tcp_min_tso_segs * mss_now); + + xmit_size_goal = min_t(u32, gso_size, + sk->sk_gso_max_size - 1 - hlen); xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); @@ -805,7 +858,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, old_size_goal + mss_now > xmit_size_goal)) { xmit_size_goal = old_size_goal; } else { - tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + tp->xmit_size_goal_segs = + min_t(u16, xmit_size_goal / mss_now, + sk->sk_gso_max_segs); xmit_size_goal = tp->xmit_size_goal_segs * mss_now; } } @@ -823,8 +878,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } -static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, - size_t psize, int flags) +static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, + size_t size, int flags) { struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; @@ -832,10 +887,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + /* Wait for a connection to finish. One exception is TCP Fast Open + * (passive side) where data is allowed to be sent before a connection + * is fully established. + */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && + !tcp_passive_fastopen(sk)) { if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_err; + } clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -846,12 +906,9 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; - while (psize > 0) { + while (size > 0) { struct sk_buff *skb = tcp_write_queue_tail(sk); - struct page *page = pages[poffset / PAGE_SIZE]; int copy, i; - int offset = poffset % PAGE_SIZE; - int size = min_t(size_t, psize, PAGE_SIZE - offset); bool can_coalesce; if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { @@ -885,6 +942,7 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; @@ -900,8 +958,8 @@ new_segment: TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; copied += copy; - poffset += copy; - if (!(psize -= copy)) + offset += copy; + if (!(size -= copy)) goto out; if (skb->len < size_goal || (flags & MSG_OOB)) @@ -917,7 +975,8 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -927,7 +986,7 @@ wait_for_memory: out: if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); return copied; do_error: @@ -948,7 +1007,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); - res = do_tcp_sendpages(sk, &page, offset, size, flags); + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; } @@ -977,31 +1036,79 @@ static inline int select_size(const struct sock *sk, bool sg) return tmp; } +void tcp_free_fastopen_req(struct tcp_sock 
*tp) +{ + if (tp->fastopen_req != NULL) { + kfree(tp->fastopen_req); + tp->fastopen_req = NULL; + } +} + +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, + int *copied, size_t size) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err, flags; + + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + return -EOPNOTSUPP; + if (tp->fastopen_req != NULL) + return -EALREADY; /* Another Fast Open is in progress */ + + tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), + sk->sk_allocation); + if (unlikely(tp->fastopen_req == NULL)) + return -ENOBUFS; + tp->fastopen_req->data = msg; + tp->fastopen_req->size = size; + + flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; + err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, flags); + *copied = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + return err; +} + int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int iovlen, flags, err, copied; - int mss_now = 0, size_goal; + int iovlen, flags, err, copied = 0; + int mss_now = 0, size_goal, copied_syn = 0, offset = 0; bool sg; long timeo; lock_sock(sk); flags = msg->msg_flags; + if (flags & MSG_FASTOPEN) { + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); + if (err == -EINPROGRESS && copied_syn > 0) + goto out; + else if (err) + goto out_err; + offset = copied_syn; + } + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + /* Wait for a connection to finish. One exception is TCP Fast Open + * (passive side) where data is allowed to be sent before a connection + * is fully established. + */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && + !tcp_passive_fastopen(sk)) { if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; + goto do_error; + } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); - goto out; + goto out_nopush; } err = -EINVAL; @@ -1032,6 +1139,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, unsigned char __user *from = iov->iov_base; iov++; + if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ + if (offset >= seglen) { + offset -= seglen; + continue; + } + seglen -= offset; + from += offset; + offset = 0; + } while (seglen > 0) { int copy = 0; @@ -1059,6 +1175,13 @@ new_segment: goto wait_for_memory; /* + * All packets are restored as if they have + * already been sent. + */ + if (tp->repair) + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + /* * Check whether we can use HW checksum. */ if (sk->sk_route_caps & NETIF_F_ALL_CSUM) @@ -1081,78 +1204,43 @@ new_segment: if (err) goto do_fault; } else { - bool merge = false; + bool merge = true; int i = skb_shinfo(skb)->nr_frags; - struct page *page = sk->sk_sndmsg_page; - int off; - - if (page && page_count(page) == 1) - sk->sk_sndmsg_off = 0; - - off = sk->sk_sndmsg_off; - - if (skb_can_coalesce(skb, i, page, off) && - off != PAGE_SIZE) { - /* We can extend the last page - * fragment. */ - merge = true; - } else if (i == MAX_SKB_FRAGS || !sg) { - /* Need to add new fragment and cannot - * do this because interface is non-SG, - * or because all the page slots are - * busy. 
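The MSG_FASTOPEN branch added to tcp_sendmsg() above is what backs the client-side API: a single sendto() on an unconnected TCP socket both starts the handshake and queues payload for the SYN. A hedged userspace sketch, assuming a kernel/libc that defines MSG_FASTOPEN and a net.ipv4.tcp_fastopen setting with the client bit enabled; the address and request string are placeholders:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(8080),
	};
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	/* connect + send in one call; if no Fast Open cookie is cached yet,
	 * the kernel falls back to a plain SYN and sends the data after the
	 * handshake completes */
	if (sendto(fd, req, strlen(req), MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto(MSG_FASTOPEN)");

	close(fd);
	return 0;
}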
*/ - tcp_mark_push(tp, skb); - goto new_segment; - } else if (page) { - if (off == PAGE_SIZE) { - put_page(page); - sk->sk_sndmsg_page = page = NULL; - off = 0; + struct page_frag *pfrag = sk_page_frag(sk); + + if (!sk_page_frag_refill(sk, pfrag)) + goto wait_for_memory; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + if (i == MAX_SKB_FRAGS || !sg) { + tcp_mark_push(tp, skb); + goto new_segment; } - } else - off = 0; + merge = false; + } - if (copy > PAGE_SIZE - off) - copy = PAGE_SIZE - off; + copy = min_t(int, copy, pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; - if (!page) { - /* Allocate new cache page. */ - if (!(page = sk_stream_alloc_page(sk))) - goto wait_for_memory; - } - - /* Time to copy data. We are close to - * the end! */ err = skb_copy_to_page_nocache(sk, from, skb, - page, off, copy); - if (err) { - /* If this page was new, give it to the - * socket so it does not get leaked. - */ - if (!sk->sk_sndmsg_page) { - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; - } + pfrag->page, + pfrag->offset, + copy); + if (err) goto do_error; - } /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { - skb_fill_page_desc(skb, i, page, off, copy); - if (sk->sk_sndmsg_page) { - get_page(page); - } else if (off + copy < PAGE_SIZE) { - get_page(page); - sk->sk_sndmsg_page = page; - } + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); } - - sk->sk_sndmsg_off = off + copy; + pfrag->offset += copy; } if (!copied) @@ -1180,8 +1268,9 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - if (copied && likely(!tp->repair)) - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + if (copied) + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -1191,10 +1280,11 @@ wait_for_memory: } out: - if (copied && likely(!tp->repair)) - tcp_push(sk, flags, mss_now, tp->nonagle); + if (copied) + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush: release_sock(sk); - return copied; + return copied + copied_syn; do_fault: if (!skb->len) { @@ -1207,7 +1297,7 @@ do_fault: } do_error: - if (copied) + if (copied + copied_syn) goto out; out_err: err = sk_stream_error(sk, flags, err); @@ -1376,12 +1466,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) return; last_issued = tp->ucopy.dma_cookie; - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + dma_async_issue_pending(tp->ucopy.dma_chan); do { - if (dma_async_memcpy_complete(tp->ucopy.dma_chan, + if (dma_async_is_tx_complete(tp->ucopy.dma_chan, last_issued, &done, - &used) == DMA_SUCCESS) { + &used) == DMA_COMPLETE) { /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); break; @@ -1389,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_async_wait_queue)) && (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { + used) == DMA_COMPLETE)) { __skb_dequeue(&sk->sk_async_wait_queue); kfree_skb(skb); } @@ -1398,12 +1488,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) } #endif -static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; - skb_queue_walk(&sk->sk_receive_queue, skb) { + while 
((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (tcp_hdr(skb)->syn) offset--; @@ -1411,6 +1501,11 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) *off = offset; return skb; } + /* This looks weird, but this can happen if TCP collapsing + * splitted a fat GRO packet, while we released socket lock + * in skb_splice_bits() + */ + sk_eat_skb(sk, skb, false); } return NULL; } @@ -1452,7 +1547,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } used = recv_actor(desc, skb, offset, len); - if (used < 0) { + if (used <= 0) { if (!copied) copied = used; break; @@ -1461,15 +1556,19 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, copied += used; offset += used; } - /* - * If recv_actor drops the lock (e.g. TCP splice + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ - skb = tcp_recv_skb(sk, seq-1, &offset); - if (!skb || (offset+1 != skb->len)) + skb = tcp_recv_skb(sk, seq - 1, &offset); + if (!skb) break; + /* TCP coalescing might have appended data to the skb. + * Try to splice more frags + */ + if (offset + 1 != skb->len) + continue; } if (tcp_hdr(skb)->fin) { sk_eat_skb(sk, skb, false); @@ -1486,8 +1585,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ - if (copied > 0) + if (copied > 0) { + tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); + } return copied; } EXPORT_SYMBOL(tcp_read_sock); @@ -1516,6 +1617,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + (sk->sk_state == TCP_ESTABLISHED)) + sk_busy_loop(sk, nonblock); + lock_sock(sk); err = -ENOTCONN; @@ -1564,11 +1669,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && net_dma_find_channel()) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif @@ -1704,8 +1809,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } #ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + if (tp->ucopy.dma_chan) { + if (tp->rcv_wnd == 0 && + !skb_queue_empty(&sk->sk_async_wait_queue)) { + tcp_service_net_dma(sk, true); + tcp_cleanup_rbuf(sk, copied); + } else + dma_async_issue_pending(tp->ucopy.dma_chan); + } #endif if (copied >= target) { /* Do not sleep, just process backlog. */ @@ -1797,7 +1908,7 @@ do_prequeue: break; } - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + dma_async_issue_pending(tp->ucopy.dma_chan); if ((offset + used) == skb->len) copied_early = true; @@ -2086,6 +2197,10 @@ void tcp_close(struct sock *sk, long timeout) * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK + * XXX (TFO) - To start off we don't support SYN+ACK+FIN + * in a single packet! (May consider it later but will + * probably need API support or TCP_CORK SYN-ACK until + * data is written and socket is closed.) 
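The sk_busy_loop() call added to tcp_recvmsg() above only fires when busy polling is enabled, either globally (net.core.busy_read) or per socket. A sketch of the per-socket form, assuming SO_BUSY_POLL is available (Linux 3.11+; the fallback value below matches common architectures) and noting that early kernels required CAP_NET_ADMIN to set it:

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL 46
#endif

static int enable_busy_poll(int fd)
{
	int usecs = 50;	/* spin up to ~50us on the device queue before sleeping */

	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) < 0) {
		perror("setsockopt(SO_BUSY_POLL)");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	return enable_busy_poll(fd);
}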
*/ tcp_send_fin(sk); } @@ -2117,7 +2232,7 @@ adjudge_to_death: /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. @@ -2157,8 +2272,16 @@ adjudge_to_death: } } - if (sk->sk_state == TCP_CLOSE) + if (sk->sk_state == TCP_CLOSE) { + struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + /* We could get here with a non-NULL req if the socket is + * aborted (e.g., closed with unread data) before 3WHS + * finishes. + */ + if (req != NULL) + reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); + } /* Otherwise, socket is reprieved until protocol close. */ out: @@ -2219,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); - tp->srtt = 0; + tp->srtt_us = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; @@ -2228,7 +2351,6 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); @@ -2244,9 +2366,16 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); +void tcp_sock_destruct(struct sock *sk) +{ + inet_sock_destruct(sk); + + kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); +} + static inline bool tcp_can_repair_sock(const struct sock *sk) { - return capable(CAP_NET_ADMIN) && + return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } @@ -2267,10 +2396,17 @@ static int tcp_repair_options_est(struct tcp_sock *tp, tp->rx_opt.mss_clamp = opt.opt_val; break; case TCPOPT_WINDOW: - if (opt.opt_val > 14) - return -EFBIG; + { + u16 snd_wscale = opt.opt_val & 0xFFFF; + u16 rcv_wscale = opt.opt_val >> 16; + + if (snd_wscale > 14 || rcv_wscale > 14) + return -EFBIG; - tp->rx_opt.snd_wscale = opt.opt_val; + tp->rx_opt.snd_wscale = snd_wscale; + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rx_opt.wscale_ok = 1; + } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) @@ -2322,92 +2458,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } - case TCP_COOKIE_TRANSACTIONS: { - struct tcp_cookie_transactions ctd; - struct tcp_cookie_values *cvp = NULL; - - if (sizeof(ctd) > optlen) - return -EINVAL; - if (copy_from_user(&ctd, optval, sizeof(ctd))) - return -EFAULT; - - if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || - ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) - return -EINVAL; - - if (ctd.tcpct_cookie_desired == 0) { - /* default to global value */ - } else if ((0x1 & ctd.tcpct_cookie_desired) || - ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || - ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { - return -EINVAL; - } - - if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { - /* Supercedes all other values */ - lock_sock(sk); - if (tp->cookie_values != NULL) { - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - tp->cookie_values = NULL; - } - tp->rx_opt.cookie_in_always = 0; /* false */ - tp->rx_opt.cookie_out_never = 1; /* true */ - release_sock(sk); - return err; - } - - /* Allocate ancillary memory 
before locking. - */ - if (ctd.tcpct_used > 0 || - (tp->cookie_values == NULL && - (sysctl_tcp_cookie_size > 0 || - ctd.tcpct_cookie_desired > 0 || - ctd.tcpct_s_data_desired > 0))) { - cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, - GFP_KERNEL); - if (cvp == NULL) - return -ENOMEM; - - kref_init(&cvp->kref); - } - lock_sock(sk); - tp->rx_opt.cookie_in_always = - (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); - tp->rx_opt.cookie_out_never = 0; /* false */ - - if (tp->cookie_values != NULL) { - if (cvp != NULL) { - /* Changed values are recorded by a changed - * pointer, ensuring the cookie will differ, - * without separately hashing each value later. - */ - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - } else { - cvp = tp->cookie_values; - } - } - - if (cvp != NULL) { - cvp->cookie_desired = ctd.tcpct_cookie_desired; - - if (ctd.tcpct_used > 0) { - memcpy(cvp->s_data_payload, ctd.tcpct_value, - ctd.tcpct_used); - cvp->s_data_desired = ctd.tcpct_used; - cvp->s_data_constant = 1; /* true */ - } else { - /* No constant payload data. */ - cvp->s_data_desired = ctd.tcpct_s_data_desired; - cvp->s_data_constant = 0; /* false */ - } - - tp->cookie_values = cvp; - } - release_sock(sk); - return err; - } default: /* fallthru */ break; @@ -2460,10 +2510,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else + else { tp->thin_dupack = val; if (tp->thin_dupack) tcp_disable_early_retrans(tp); + } break; case TCP_REPAIR: @@ -2625,7 +2676,28 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Cap the max timeout in ms TCP will retry/retrans * before giving up and aborting (ETIMEDOUT) a connection. */ - icsk->icsk_user_timeout = msecs_to_jiffies(val); + if (val < 0) + err = -EINVAL; + else + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; + + case TCP_FASTOPEN: + if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | + TCPF_LISTEN))) + err = fastopen_init_queue(sk, val); + else + err = -EINVAL; + break; + case TCP_TIMESTAMP: + if (!tp->repair) + err = -EPERM; + else + tp->tsoffset = val - tcp_time_stamp; + break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); break; default: err = -ENOPROTOOPT; @@ -2689,6 +2761,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); @@ -2712,8 +2786,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; - info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; - info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_rtt = tp->srtt_us >> 3; + info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; @@ -2723,6 +2797,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + + info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? + sk->sk_pacing_rate : ~0ULL; + info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? 
+ sk->sk_max_pacing_rate : ~0ULL; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2808,41 +2887,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; - case TCP_COOKIE_TRANSACTIONS: { - struct tcp_cookie_transactions ctd; - struct tcp_cookie_values *cvp = tp->cookie_values; - - if (get_user(len, optlen)) - return -EFAULT; - if (len < sizeof(ctd)) - return -EINVAL; - - memset(&ctd, 0, sizeof(ctd)); - ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? - TCP_COOKIE_IN_ALWAYS : 0) - | (tp->rx_opt.cookie_out_never ? - TCP_COOKIE_OUT_NEVER : 0); - - if (cvp != NULL) { - ctd.tcpct_flags |= (cvp->s_data_in ? - TCP_S_DATA_IN : 0) - | (cvp->s_data_out ? - TCP_S_DATA_OUT : 0); - - ctd.tcpct_cookie_desired = cvp->cookie_desired; - ctd.tcpct_s_data_desired = cvp->s_data_desired; - - memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], - cvp->cookie_pair_size); - ctd.tcpct_used = cvp->cookie_pair_size; - } - - if (put_user(sizeof(ctd), optlen)) - return -EFAULT; - if (copy_to_user(optval, &ctd, sizeof(ctd))) - return -EFAULT; - return 0; - } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; @@ -2873,6 +2917,20 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; + + case TCP_FASTOPEN: + if (icsk->icsk_accept_queue.fastopenq != NULL) + val = icsk->icsk_accept_queue.fastopenq->max_qlen; + else + val = 0; + break; + + case TCP_TIMESTAMP: + val = tcp_time_stamp + tp->tsoffset; + break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; + break; default: return -ENOPROTOOPT; } @@ -2908,212 +2966,9 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct tcphdr *th; - unsigned int thlen; - unsigned int seq; - __be32 delta; - unsigned int oldlen; - unsigned int mss; - - if (!pskb_may_pull(skb, sizeof(*th))) - goto out; - - th = tcp_hdr(skb); - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - if (!pskb_may_pull(skb, thlen)) - goto out; - - oldlen = (u16)~skb->len; - __skb_pull(skb, thlen); - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. 
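Two of the socket options wired up in the setsockopt/getsockopt hunks above are aimed at servers: TCP_FASTOPEN sizes the listener's pending Fast Open queue (via fastopen_init_queue), and TCP_NOTSENT_LOWAT bounds how much unsent data may accumulate before the socket stops reporting writability. A sketch of setting both before listen(); the port, queue length and low-water mark are examples, and the fallback option values are the ones from the kernel UAPI headers:

#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23
#endif
#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family      = AF_INET,
		.sin_port        = htons(8080),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int qlen  = 16;			/* max pending TFO requests on the listener */
	int lowat = 128 * 1024;		/* unsent-byte ceiling for POLLOUT */

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("TCP_FASTOPEN");
	if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat)) < 0)
		perror("TCP_NOTSENT_LOWAT");

	listen(fd, 128);
	return 0;
}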
*/ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - 0) || - !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) - goto out; - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - segs = skb_segment(skb, features); - if (IS_ERR(segs)) - goto out; - - delta = htonl(oldlen + (thlen + mss)); - - skb = segs; - th = tcp_hdr(skb); - seq = ntohl(th->seq); - - do { - th->fin = th->psh = 0; - - th->check = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = - csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); - - seq += mss; - skb = skb->next; - th = tcp_hdr(skb); - - th->seq = htonl(seq); - th->cwr = 0; - } while (skb->next); - - delta = htonl(oldlen + (skb->tail - skb->transport_header) + - skb->data_len); - th->check = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); - -out: - return segs; -} -EXPORT_SYMBOL(tcp_tso_segment); - -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - struct sk_buff **pp = NULL; - struct sk_buff *p; - struct tcphdr *th; - struct tcphdr *th2; - unsigned int len; - unsigned int thlen; - __be32 flags; - unsigned int mss = 1; - unsigned int hlen; - unsigned int off; - int flush = 1; - int i; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*th); - th = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - goto out; - } - - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - hlen = off + thlen; - if (skb_gro_header_hard(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - goto out; - } - - skb_gro_pull(skb, thlen); - - len = skb_gro_len(skb); - flags = tcp_flag_word(th); - - for (; (p = *head); head = &p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - th2 = tcp_hdr(p); - - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - goto found; - } - - goto out_check_final; - -found: - flush = NAPI_GRO_CB(p)->flush; - flush |= (__force int)(flags & TCP_FLAG_CWR); - flush |= (__force int)((flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); - flush |= (__force int)(th->ack_seq ^ th2->ack_seq); - for (i = sizeof(*th); i < thlen; i += 4) - flush |= *(u32 *)((u8 *)th + i) ^ - *(u32 *)((u8 *)th2 + i); - - mss = skb_shinfo(p)->gso_size; - - flush |= (len - 1) >= mss; - flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - - if (flush || skb_gro_receive(head, skb)) { - mss = 1; - goto out_check_final; - } - - p = *head; - th2 = tcp_hdr(p); - tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); - -out_check_final: - flush = len < mss; - flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | - TCP_FLAG_RST | TCP_FLAG_SYN | - TCP_FLAG_FIN)); - - if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; - -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} -EXPORT_SYMBOL(tcp_gro_receive); - -int tcp_gro_complete(struct sk_buff *skb) -{ - struct tcphdr *th = tcp_hdr(skb); - - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - - 
skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - - if (th->cwr) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - - return 0; -} -EXPORT_SYMBOL(tcp_gro_complete); - #ifdef CONFIG_TCP_MD5SIG -static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; -static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; +static DEFINE_MUTEX(tcp_md5sig_mutex); static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) { @@ -3128,87 +2983,45 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) free_percpu(pool); } -void tcp_free_md5sig_pool(void) -{ - struct tcp_md5sig_pool __percpu *pool = NULL; - - spin_lock_bh(&tcp_md5sig_pool_lock); - if (--tcp_md5sig_users == 0) { - pool = tcp_md5sig_pool; - tcp_md5sig_pool = NULL; - } - spin_unlock_bh(&tcp_md5sig_pool_lock); - if (pool) - __tcp_free_md5sig_pool(pool); -} -EXPORT_SYMBOL(tcp_free_md5sig_pool); - -static struct tcp_md5sig_pool __percpu * -__tcp_alloc_md5sig_pool(struct sock *sk) +static void __tcp_alloc_md5sig_pool(void) { int cpu; struct tcp_md5sig_pool __percpu *pool; pool = alloc_percpu(struct tcp_md5sig_pool); if (!pool) - return NULL; + return; for_each_possible_cpu(cpu) { struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (!hash || IS_ERR(hash)) + if (IS_ERR_OR_NULL(hash)) goto out_free; per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; } - return pool; + /* before setting tcp_md5sig_pool, we must commit all writes + * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool() + */ + smp_wmb(); + tcp_md5sig_pool = pool; + return; out_free: __tcp_free_md5sig_pool(pool); - return NULL; } -struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) +bool tcp_alloc_md5sig_pool(void) { - struct tcp_md5sig_pool __percpu *pool; - bool alloc = false; - -retry: - spin_lock_bh(&tcp_md5sig_pool_lock); - pool = tcp_md5sig_pool; - if (tcp_md5sig_users++ == 0) { - alloc = true; - spin_unlock_bh(&tcp_md5sig_pool_lock); - } else if (!pool) { - tcp_md5sig_users--; - spin_unlock_bh(&tcp_md5sig_pool_lock); - cpu_relax(); - goto retry; - } else - spin_unlock_bh(&tcp_md5sig_pool_lock); - - if (alloc) { - /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool __percpu *p; - - p = __tcp_alloc_md5sig_pool(sk); - spin_lock_bh(&tcp_md5sig_pool_lock); - if (!p) { - tcp_md5sig_users--; - spin_unlock_bh(&tcp_md5sig_pool_lock); - return NULL; - } - pool = tcp_md5sig_pool; - if (pool) { - /* oops, it has already been assigned. 
*/ - spin_unlock_bh(&tcp_md5sig_pool_lock); - __tcp_free_md5sig_pool(p); - } else { - tcp_md5sig_pool = pool = p; - spin_unlock_bh(&tcp_md5sig_pool_lock); - } + if (unlikely(!tcp_md5sig_pool)) { + mutex_lock(&tcp_md5sig_mutex); + + if (!tcp_md5sig_pool) + __tcp_alloc_md5sig_pool(); + + mutex_unlock(&tcp_md5sig_mutex); } - return pool; + return tcp_md5sig_pool != NULL; } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -3225,28 +3038,15 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) struct tcp_md5sig_pool __percpu *p; local_bh_disable(); - - spin_lock(&tcp_md5sig_pool_lock); - p = tcp_md5sig_pool; - if (p) - tcp_md5sig_users++; - spin_unlock(&tcp_md5sig_pool_lock); - + p = ACCESS_ONCE(tcp_md5sig_pool); if (p) - return this_cpu_ptr(p); + return __this_cpu_ptr(p); local_bh_enable(); return NULL; } EXPORT_SYMBOL(tcp_get_md5sig_pool); -void tcp_put_md5sig_pool(void) -{ - local_bh_enable(); - tcp_free_md5sig_pool(); -} -EXPORT_SYMBOL(tcp_put_md5sig_pool); - int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, const struct tcphdr *th) { @@ -3285,8 +3085,11 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, for (i = 0; i < shi->nr_frags; ++i) { const struct skb_frag_struct *f = &shi->frags[i]; - struct page *page = skb_frag_page(f); - sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); + unsigned int offset = f->page_offset; + struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); + + sg_set_page(&sg, page, skb_frag_size(f), + offset_in_page(offset)); if (crypto_hash_update(desc, &sg, skb_frag_size(f))) return 1; } @@ -3310,142 +3113,17 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif -/** - * Each Responder maintains up to two secret values concurrently for - * efficient secret rollover. Each secret value has 4 states: - * - * Generating. (tcp_secret_generating != tcp_secret_primary) - * Generates new Responder-Cookies, but not yet used for primary - * verification. This is a short-term state, typically lasting only - * one round trip time (RTT). - * - * Primary. (tcp_secret_generating == tcp_secret_primary) - * Used both for generation and primary verification. - * - * Retiring. (tcp_secret_retiring != tcp_secret_secondary) - * Used for verification, until the first failure that can be - * verified by the newer Generating secret. At that time, this - * cookie's state is changed to Secondary, and the Generating - * cookie's state is changed to Primary. This is a short-term state, - * typically lasting only one round trip time (RTT). - * - * Secondary. (tcp_secret_retiring == tcp_secret_secondary) - * Used for secondary verification, after primary verification - * failures. This state lasts no more than twice the Maximum Segment - * Lifetime (2MSL). Then, the secret is discarded. - */ -struct tcp_cookie_secret { - /* The secret is divided into two parts. The digest part is the - * equivalent of previously hashing a secret and saving the state, - * and serves as an initialization vector (IV). The message part - * serves as the trailing secret. - */ - u32 secrets[COOKIE_WORKSPACE_WORDS]; - unsigned long expires; -}; - -#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) -#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) -#define TCP_SECRET_LIFE (HZ * 600) - -static struct tcp_cookie_secret tcp_secret_one; -static struct tcp_cookie_secret tcp_secret_two; - -/* Essentially a circular list, without dynamic allocation. 
*/ -static struct tcp_cookie_secret *tcp_secret_generating; -static struct tcp_cookie_secret *tcp_secret_primary; -static struct tcp_cookie_secret *tcp_secret_retiring; -static struct tcp_cookie_secret *tcp_secret_secondary; - -static DEFINE_SPINLOCK(tcp_secret_locker); - -/* Select a pseudo-random word in the cookie workspace. - */ -static inline u32 tcp_cookie_work(const u32 *ws, const int n) -{ - return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; -} - -/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. - * Called in softirq context. - * Returns: 0 for success. - */ -int tcp_cookie_generator(u32 *bakery) -{ - unsigned long jiffy = jiffies; - - if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { - spin_lock_bh(&tcp_secret_locker); - if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { - /* refreshed by another */ - memcpy(bakery, - &tcp_secret_generating->secrets[0], - COOKIE_WORKSPACE_WORDS); - } else { - /* still needs refreshing */ - get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); - - /* The first time, paranoia assumes that the - * randomization function isn't as strong. But, - * this secret initialization is delayed until - * the last possible moment (packet arrival). - * Although that time is observable, it is - * unpredictably variable. Mash in the most - * volatile clock bits available, and expire the - * secret extra quickly. - */ - if (unlikely(tcp_secret_primary->expires == - tcp_secret_secondary->expires)) { - struct timespec tv; - - getnstimeofday(&tv); - bakery[COOKIE_DIGEST_WORDS+0] ^= - (u32)tv.tv_nsec; - - tcp_secret_secondary->expires = jiffy - + TCP_SECRET_1MSL - + (0x0f & tcp_cookie_work(bakery, 0)); - } else { - tcp_secret_secondary->expires = jiffy - + TCP_SECRET_LIFE - + (0xff & tcp_cookie_work(bakery, 1)); - tcp_secret_primary->expires = jiffy - + TCP_SECRET_2MSL - + (0x1f & tcp_cookie_work(bakery, 2)); - } - memcpy(&tcp_secret_secondary->secrets[0], - bakery, COOKIE_WORKSPACE_WORDS); - - rcu_assign_pointer(tcp_secret_generating, - tcp_secret_secondary); - rcu_assign_pointer(tcp_secret_retiring, - tcp_secret_primary); - /* - * Neither call_rcu() nor synchronize_rcu() needed. - * Retiring data is not freed. It is replaced after - * further (locked) pointer updates, and a quiet time - * (minimum 1MSL, maximum LIFE - 2MSL). 
- */ - } - spin_unlock_bh(&tcp_secret_locker); - } else { - rcu_read_lock_bh(); - memcpy(bakery, - &rcu_dereference(tcp_secret_generating)->secrets[0], - COOKIE_WORKSPACE_WORDS); - rcu_read_unlock_bh(); - } - return 0; -} -EXPORT_SYMBOL(tcp_cookie_generator); - void tcp_done(struct sock *sk) { + struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); + if (req != NULL) + reqsk_fastopen_remove(sk, req, false); sk->sk_shutdown = SHUTDOWN_MASK; @@ -3474,13 +3152,13 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); -void tcp_init_mem(struct net *net) +static void tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; + sysctl_tcp_mem[1] = limit; + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; } void __init tcp_init(void) @@ -3489,7 +3167,6 @@ void __init tcp_init(void) unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3509,25 +3186,22 @@ void __init tcp_init(void) alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, - (totalram_pages >= 128 * 1024) ? - 13 : 15, + 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); - for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); - INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); - } + if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, - (totalram_pages >= 128 * 1024) ? 
- 13 : 15, + 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, @@ -3546,7 +3220,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - tcp_init_mem(&init_net); + tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); @@ -3563,14 +3237,9 @@ void __init tcp_init(void) pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + tcp_metrics_init(); + tcp_register_congestion_control(&tcp_reno); - memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); - memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); - tcp_secret_one.expires = jiffy; /* past due */ - tcp_secret_two.expires = jiffy; /* past due */ - tcp_secret_generating = &tcp_secret_one; - tcp_secret_primary = &tcp_secret_one; - tcp_secret_retiring = &tcp_secret_two; - tcp_secret_secondary = &tcp_secret_two; + tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index f45e1c24244..d5de69bc04f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -140,16 +140,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 04dbd7ae7c6..7b09d8b49fa 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,7 +1,7 @@ /* * Plugable TCP congestion control support and newReno * congestion control. - * Based on ideas from I/O scheduler suport and Web100. + * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ @@ -15,8 +15,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_max_ssthresh = 0; - static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); @@ -259,7 +257,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (!ca) err = -ENOENT; - else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || + ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) err = -EPERM; else if (!try_module_get(ca->owner)) @@ -277,65 +276,24 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer +/* Slow start is used when congestion window is no greater than the slow start + * threshold. We base on RFC2581 and also handle stretch ACKs properly. + * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but + * something better;) a packet is only considered (s)acked in its entirety to + * defend the ACK attacks described in the RFC. Slow start processes a stretch + * ACK of degree N as if N acks of degree 1 are received back to back except + * ABC caps N to 2. 
Slow start exits when cwnd grows over ssthresh and + * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +int tcp_slow_start(struct tcp_sock *tp, u32 acked) { - const struct tcp_sock *tp = tcp_sk(sk); - u32 left; - - if (in_flight >= tp->snd_cwnd) - return true; + u32 cwnd = tp->snd_cwnd + acked; - left = tp->snd_cwnd - in_flight; - if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size) - return true; - return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - -/* - * Slow start is used when congestion window is less than slow start - * threshold. This version implements the basic RFC2581 version - * and optionally supports: - * RFC3742 Limited Slow Start - growth limited to max_ssthresh - * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged - */ -void tcp_slow_start(struct tcp_sock *tp) -{ - int cnt; /* increase in packets */ - - /* RFC3465: ABC Slow start - * Increase only after a full MSS of bytes is acked - * - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) - return; - - if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) - cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ - else - cnt = tp->snd_cwnd; /* exponential increase */ - - /* RFC3465: ABC - * We MAY increase by 2 if discovered delayed ack - */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) - cnt <<= 1; - tp->bytes_acked = 0; - - tp->snd_cwnd_cnt += cnt; - while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd_cnt -= tp->snd_cwnd; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } + if (cwnd > tp->snd_ssthresh) + cwnd = tp->snd_ssthresh + 1; + acked -= cwnd - tp->snd_cwnd; + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -359,30 +317,19 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); - + tcp_slow_start(tp, acked); /* In dangerous area, increase slowly. */ - else if (sysctl_tcp_abc) { - /* RFC3465: Appropriate Byte Count - * increase once for each full cwnd acked - */ - if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { - tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } else { + else tcp_cong_avoid_ai(tp, tp->snd_cwnd); - } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); @@ -394,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. 
*/ -u32 tcp_reno_min_cwnd(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); - struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; /* Initial congestion control used (until SYN) @@ -420,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = { .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9077f441cb..a9bd8a4828a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -206,8 +206,8 @@ static u32 cubic_root(u64 a) */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { - u64 offs; - u32 delta, t, bic_target, max_cnt; + u32 delta, bic_target, max_cnt; + u64 offs, t; ca->ack_cnt++; /* count the number of ACKs */ @@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) * if the cwnd < 1 million packets !!! */ + t = (s32)(tcp_time_stamp - ca->epoch_start); + t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - - ca->epoch_start) << BICTCP_HZ) / HZ; + t <<= BICTCP_HZ; + do_div(t, HZ); if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; @@ -302,18 +304,18 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); - tcp_slow_start(tp); + tcp_slow_start(tp, acked); } else { bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); @@ -406,7 +408,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; ratio += cnt; - ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); + ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ @@ -414,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) return; /* Discard delay samples right after fast recovery */ - if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; delay = (rtt_us << 3) / USEC_PER_MSEC; @@ -473,10 +475,6 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - /* hystart needs ms clock resolution */ - if (hystart && HZ < 1000) - cubictcp.flags |= TCP_CONG_RTT_STAMP; - return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c new file mode 100644 index 00000000000..9771563ab56 --- /dev/null +++ b/net/ipv4/tcp_fastopen.c @@ -0,0 +1,295 @@ +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/tcp.h> +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <net/inetpeer.h> +#include <net/tcp.h> + +int sysctl_tcp_fastopen __read_mostly = 
TFO_CLIENT_ENABLE; + +struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + +static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); + +void tcp_fastopen_init_key_once(bool publish) +{ + static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + /* tcp_fastopen_reset_cipher publishes the new context + * atomically, so we allow this race happening here. + * + * All call sites of tcp_fastopen_cookie_gen also check + * for a valid cookie, so this is an acceptable risk. + */ + if (net_get_random_once(key, sizeof(key)) && publish) + tcp_fastopen_reset_cipher(key, sizeof(key)); +} + +static void tcp_fastopen_ctx_free(struct rcu_head *head) +{ + struct tcp_fastopen_context *ctx = + container_of(head, struct tcp_fastopen_context, rcu); + crypto_free_cipher(ctx->tfm); + kfree(ctx); +} + +int tcp_fastopen_reset_cipher(void *key, unsigned int len) +{ + int err; + struct tcp_fastopen_context *ctx, *octx; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->tfm = crypto_alloc_cipher("aes", 0, 0); + + if (IS_ERR(ctx->tfm)) { + err = PTR_ERR(ctx->tfm); +error: kfree(ctx); + pr_err("TCP: TFO aes cipher alloc error: %d\n", err); + return err; + } + err = crypto_cipher_setkey(ctx->tfm, key, len); + if (err) { + pr_err("TCP: TFO cipher key error: %d\n", err); + crypto_free_cipher(ctx->tfm); + goto error; + } + memcpy(ctx->key, key, len); + + spin_lock(&tcp_fastopen_ctx_lock); + + octx = rcu_dereference_protected(tcp_fastopen_ctx, + lockdep_is_held(&tcp_fastopen_ctx_lock)); + rcu_assign_pointer(tcp_fastopen_ctx, ctx); + spin_unlock(&tcp_fastopen_ctx_lock); + + if (octx) + call_rcu(&octx->rcu, tcp_fastopen_ctx_free); + return err; +} + +static bool __tcp_fastopen_cookie_gen(const void *path, + struct tcp_fastopen_cookie *foc) +{ + struct tcp_fastopen_context *ctx; + bool ok = false; + + tcp_fastopen_init_key_once(true); + + rcu_read_lock(); + ctx = rcu_dereference(tcp_fastopen_ctx); + if (ctx) { + crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + ok = true; + } + rcu_read_unlock(); + return ok; +} + +/* Generate the fastopen cookie by doing aes128 encryption on both + * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 + * addresses. For the longer IPv6 addresses use CBC-MAC. + * + * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
+ */ +static bool tcp_fastopen_cookie_gen(struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + if (req->rsk_ops->family == AF_INET) { + const struct iphdr *iph = ip_hdr(syn); + + __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; + return __tcp_fastopen_cookie_gen(path, foc); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (req->rsk_ops->family == AF_INET6) { + const struct ipv6hdr *ip6h = ipv6_hdr(syn); + struct tcp_fastopen_cookie tmp; + + if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + struct in6_addr *buf = (struct in6_addr *) tmp.val; + int i = 4; + + for (i = 0; i < 4; i++) + buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; + return __tcp_fastopen_cookie_gen(buf, foc); + } + } +#endif + return false; +} + +static bool tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) +{ + struct tcp_sock *tp; + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + struct sock *child; + + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + return false; + + spin_lock(&queue->fastopenq->lock); + queue->fastopenq->qlen++; + spin_unlock(&queue->fastopenq->lock); + + /* Initialize the child socket. Have to fix some values to take + * into account the child is a Fast Open socket and is created + * only out of the bits carried in the SYN packet. + */ + tp = tcp_sk(child); + + tp->fastopen_rsk = req; + /* Do a hold on the listner sk so that if the listener is being + * closed, the child that has been accepted can live on and still + * access listen_lock. + */ + sock_hold(sk); + tcp_rsk(req)->listener = sk; + + /* RFC1323: The window in SYN & SYN/ACK segments is never + * scaled. So correct it appropriately. + */ + tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + + /* Activate the retrans timer so that SYNACK can be retransmitted. + * The request socket is not added to the SYN table of the parent + * because it's been added to the accept queue directly. + */ + inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); + + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, child); + + /* Now finish processing the fastopen child socket. */ + inet_csk(child)->icsk_af_ops->rebuild_header(child); + tcp_init_congestion_control(child); + tcp_mtup_init(child); + tcp_init_metrics(child); + tcp_init_buffer_space(child); + + /* Queue the data carried in the SYN packet. We need to first + * bump skb's refcnt because the caller will attempt to free it. + * + * XXX (TFO) - we honor a zero-payload TFO request for now, + * (any reason not to?) but no need to queue the skb since + * there is no data. How about SYN+FIN? 
+ */ + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { + skb = skb_get(skb); + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + skb_set_owner_r(skb, child); + __skb_queue_tail(&child->sk_receive_queue, skb); + tp->syn_data_acked = 1; + } + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + sk->sk_data_ready(sk); + bh_unlock_sock(child); + sock_put(child); + WARN_ON(req->sk == NULL); + return true; +} +EXPORT_SYMBOL(tcp_fastopen_create_child); + +static bool tcp_fastopen_queue_check(struct sock *sk) +{ + struct fastopen_queue *fastopenq; + + /* Make sure the listener has enabled fastopen, and we don't + * exceed the max # of pending TFO requests allowed before trying + * to validating the cookie in order to avoid burning CPU cycles + * unnecessarily. + * + * XXX (TFO) - The implication of checking the max_qlen before + * processing a cookie request is that clients can't differentiate + * between qlen overflow causing Fast Open to be disabled + * temporarily vs a server not supporting Fast Open at all. + */ + fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq == NULL || fastopenq->max_qlen == 0) + return false; + + if (fastopenq->qlen >= fastopenq->max_qlen) { + struct request_sock *req1; + spin_lock(&fastopenq->lock); + req1 = fastopenq->rskq_rst_head; + if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + spin_unlock(&fastopenq->lock); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); + return false; + } + fastopenq->rskq_rst_head = req1->dl_next; + fastopenq->qlen--; + spin_unlock(&fastopenq->lock); + reqsk_free(req1); + } + return true; +} + +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and return the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) +{ + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; + bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + + if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + (syn_data || foc->len >= 0) && + tcp_fastopen_queue_check(sk))) { + foc->len = -1; + return false; + } + + if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + goto fastopen; + + if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + foc->len == TCP_FASTOPEN_COOKIE_SIZE && + foc->len == valid_foc.len && + !memcmp(foc->val, valid_foc.val, foc->len)) { + /* Cookie is valid. Create a (full) child socket to accept + * the data in SYN before returning a SYN-ACK to ack the + * data. If we fail to create the socket, fall back and + * ack the ISN only but includes the same cookie. + * + * Note: Data-less SYN with valid cookie is allowed to send + * data in SYN_RECV state. + */ +fastopen: + if (tcp_fastopen_create_child(sk, skb, dst, req)) { + foc->len = -1; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return true; + } + } + + NET_INC_STATS_BH(sock_net(sk), foc->len ? 
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL : + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + *foc = valid_foc; + return false; +} +EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 30f27f6b365..1c4908280d9 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,16 +109,16 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); else { /* Update AIMD parameters. * @@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "highspeed" diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index c1a8175361e..031361311a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,16 +227,16 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); else { /* In dangerous area, increase slowly. 
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 57bdd17dff4..d8f8f05a495 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -21,7 +21,7 @@ struct hybla { u32 rho2; /* Rho * Rho, integer part */ u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho2_7ls; /* Rho^2, <<7 */ - u32 minrtt; /* Minimum smoothed round trip time value seen */ + u32 minrtt_us; /* Minimum smoothed round trip time value seen */ }; /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ @@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk) { struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, + tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), + 8U); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >> 7; @@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk) hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; tp->snd_cwnd = ca->rho; } @@ -85,7 +87,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -93,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ - if (tp->srtt < ca->minrtt){ + if (tp->srtt_us < ca->minrtt_us) { hybla_recalc_param(sk); - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; } - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (!ca->hybla_en) { - tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -165,7 +167,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 813b43a76fe..5999b3972e6 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,7 +23,6 @@ #define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ #define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ #define ALPHA_BASE ALPHA_SCALE /* 1.0 */ -#define U32_MAX ((u32)~0U) #define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ #define BETA_SHIFT 6 @@ -256,7 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. 
*/ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); @@ -265,12 +264,12 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) update_params(sk); /* RFC2861 only increase cwnd if fully utilized */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In slow start */ if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); else { u32 delta; @@ -313,20 +312,20 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, .tcpv_rttcnt = ca->cnt_rtt, .tcpv_minrtt = ca->base_rtt, }; - u64 t = ca->sum_rtt; - do_div(t, ca->cnt_rtt); - info.tcpv_rtt = t; + if (info.tcpv_rttcnt > 0) { + u64 t = ca->sum_rtt; + do_div(t, info.tcpv_rttcnt); + info.tcpv_rtt = t; + } nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } static struct tcp_congestion_ops tcp_illinois __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8..40639c288dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,25 +81,23 @@ int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; EXPORT_SYMBOL(sysctl_tcp_reordering); -int sysctl_tcp_ecn __read_mostly = 2; -EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 100; + int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_frto_response __read_mostly; -int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_abc __read_mostly; -int sysctl_tcp_early_retrans __read_mostly = 2; +int sysctl_tcp_early_retrans __read_mostly = 3; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -109,17 +107,16 @@ int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ -#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ -#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -235,7 +232,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + /* Better not delay acks, sender can have a very low cwnd */ + tcp_enter_quickack_mode((struct sock *)tp); + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } /* fallinto */ default: tp->ecn_flags |= TCP_ECN_SEEN; @@ -266,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ -static void tcp_fixup_sndbuf(struct sock *sk) +static void tcp_sndbuf_expand(struct sock *sk) { - int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); + const struct tcp_sock *tp = tcp_sk(sk); + int sndmem, per_mss; + u32 nr_segs; + + /* Worst case is non GSO/TSO : each frame consumes one skb + * and skb->head is kmalloced using power of two area of memory + */ + per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + + MAX_TCP_HEADER + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + per_mss = roundup_pow_of_two(per_mss) + + SKB_DATA_ALIGN(sizeof(struct sk_buff)); + + nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); + nr_segs = max_t(u32, nr_segs, tp->reordering + 1); + + /* Fast Recovery (RFC 5681 3.2) : + * Cubic needs 1.7 factor, rounded to 2 to include + * extra cushion (application might react slowly to POLLOUT) + */ + sndmem = 2 * nr_segs * per_mss; - sndmem *= TCP_INIT_CWND; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); } @@ -346,24 +367,19 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) } /* 3. Tuning rcvbuf, when connection enters established state. */ - static void tcp_fixup_rcvbuf(struct sock *sk) { u32 mss = tcp_sk(sk)->advmss; - u32 icwnd = TCP_DEFAULT_INIT_RCVWND; int rcvmem; - /* Limit to 10 segments if mss <= 1460, - * or 14600/mss segments, with a minimum of two segments. 
- */ - if (mss > 1460) - icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - - rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < mss) - rcvmem += 128; + rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * + tcp_default_init_rwnd(mss); - rcvmem *= icwnd; + /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency + * Allow enough cushion so that sender is not limited by our window + */ + if (sysctl_tcp_moderate_rcvbuf) + rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); @@ -372,7 +388,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* 4. Try to fixup all. It is made immediately after connection enters * established state. */ -static void tcp_init_buffer_space(struct sock *sk) +void tcp_init_buffer_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -380,9 +396,11 @@ static void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) - tcp_fixup_sndbuf(sk); + tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; + tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -522,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time; - int space; - - if (tp->rcvq_space.time == 0) - goto new_measure; + int copied; time = tcp_time_stamp - tp->rcvq_space.time; if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; - space = 2 * (tp->copied_seq - tp->rcvq_space.seq); + /* Number of bytes copied to user in last RTT */ + copied = tp->copied_seq - tp->rcvq_space.seq; + if (copied <= tp->rcvq_space.space) + goto new_measure; - space = max(tp->rcvq_space.space, space); + /* A bit of theory : + * copied = bytes received in previous RTT, our base window + * To cope with packet losses, we need a 2x factor + * To cope with slow start, and sender growing its cwin by 100 % + * every RTT, we need a 4x factor, because the ACK we are sending + * now is for the next RTT, not the current one : + * <prev RTT . ><current RTT .. ><next RTT .... > + */ - if (tp->rcvq_space.space != space) { - int rcvmem; + if (sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvwin, rcvmem, rcvbuf; - tp->rcvq_space.space = space; + /* minimal window to cope with packet losses, assuming + * steady state. Add some cushion because of small variations. + */ + rcvwin = (copied << 1) + 16 * tp->advmss; - if (sysctl_tcp_moderate_rcvbuf && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int new_clamp = space; + /* If rate increased by 25%, + * assume slow start, rcvwin = 3 * copied + * If rate increased by 50%, + * assume sender can use 2x growth, rcvwin = 4 * copied + */ + if (copied >= + tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { + if (copied >= + tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) + rcvwin <<= 1; + else + rcvwin += (rcvwin >> 1); + } - /* Receive space grows, normalize in order to - * take into account packet headers and sk_buff - * structure overhead. - */ - space /= tp->advmss; - if (!space) - space = 1; - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) - rcvmem += 128; - space *= rcvmem; - space = min(space, sysctl_tcp_rmem[2]); - if (space > sk->sk_rcvbuf) { - sk->sk_rcvbuf = space; - - /* Make the window clamp follow along. 
*/ - tp->window_clamp = new_clamp; - } + rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + + rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + if (rcvbuf > sk->sk_rcvbuf) { + sk->sk_rcvbuf = rcvbuf; + + /* Make the window clamp follow along. */ + tp->window_clamp = rcvwin; } } + tp->rcvq_space.space = copied; new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -635,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); - long m = mrtt; /* RTT */ + long m = mrtt_us; /* RTT */ + u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -656,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -675,33 +706,62 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (m > 0) m >>= 3; } else { - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ } - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev > tp->mdev_max) { - tp->mdev_max = tp->mdev; - if (tp->mdev_max > tp->rttvar) - tp->rttvar = tp->mdev_max; + tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev_us > tp->mdev_max_us) { + tp->mdev_max_us = tp->mdev_us; + if (tp->mdev_max_us > tp->rttvar_us) + tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + if (tp->mdev_max_us < tp->rttvar_us) + tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = tcp_rto_min(sk); + tp->mdev_max_us = tcp_rto_min_us(sk); } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev = m << 1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ + tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); + tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; } + tp->srtt_us = max(1U, srtt); +} + +/* Set the sk_pacing_rate to allow proper sizing of TSO packets. + * Note: TCP stack does not yet implement pacing. 
+ * FQ packet scheduler can be used to implement cheap but effective + * TCP pacing, to smooth the burst on large writes when packets + * in flight is significantly lower than cwnd (or rwin) + */ +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (likely(tp->srtt_us)) + do_div(rate, tp->srtt_us); + + /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate + * without any lock. We want to make sure compiler wont store + * intermediate values in this location. + */ + ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, + sk->sk_max_pacing_rate); } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -728,109 +788,6 @@ static inline void tcp_set_rto(struct sock *sk) tcp_bound_rto(sk); } -/* Save metrics learned by this TCP session. - This function is called only, when TCP finishes successfully - i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. - */ -void tcp_update_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (sysctl_tcp_nometrics_save) - return; - - dst_confirm(dst); - - if (dst && (dst->flags & DST_HOST)) { - const struct inet_connection_sock *icsk = inet_csk(sk); - int m; - unsigned long rtt; - - if (icsk->icsk_backoff || !tp->srtt) { - /* This session failed to estimate rtt. Why? - * Probably, no packets returned in time. - * Reset our results. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) - dst_metric_set(dst, RTAX_RTT, 0); - return; - } - - rtt = dst_metric_rtt(dst, RTAX_RTT); - m = rtt - tp->srtt; - - /* If newly calculated rtt larger than stored one, - * store new one. Otherwise, use EWMA. Remember, - * rtt overestimation is always better than underestimation. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) { - if (m <= 0) - set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); - else - set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); - } - - if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { - unsigned long var; - if (m < 0) - m = -m; - - /* Scale deviation to rttvar fixed point */ - m >>= 1; - if (m < tp->mdev) - m = tp->mdev; - - var = dst_metric_rtt(dst, RTAX_RTTVAR); - if (m >= var) - var = m; - else - var -= (var - m) >> 2; - - set_dst_metric_rtt(dst, RTAX_RTTVAR, var); - } - - if (tcp_in_initial_slowstart(tp)) { - /* Slow start still did not finish. */ - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); - if (!dst_metric_locked(dst, RTAX_CWND) && - tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); - } else if (tp->snd_cwnd > tp->snd_ssthresh && - icsk->icsk_ca_state == TCP_CA_Open) { - /* Cong. avoidance phase, cwnd is reliable. 
*/ - if (!dst_metric_locked(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_cwnd) >> 1); - } else { - /* Else slow start did not finish, cwnd is non-sense, - ssthresh may be also invalid. - */ - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_ssthresh) >> 1); - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); - } - - if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && - tp->reordering != sysctl_tcp_reordering) - dst_metric_set(dst, RTAX_REORDERING, tp->reordering); - } - } -} - __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -840,34 +797,11 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) -{ - struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - tp->prior_ssthresh = 0; - tp->bytes_acked = 0; - if (icsk->icsk_ca_state < TCP_CA_CWR) { - tp->undo_marker = 0; - if (set_ssthresh) - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - tp->snd_cwnd_stamp = tcp_time_stamp; - TCP_ECN_queue_cwr(tp); - - tcp_set_ca_state(sk, TCP_CA_CWR); - } -} - /* * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */ -static void tcp_disable_fack(struct tcp_sock *tp) +void tcp_disable_fack(struct tcp_sock *tp) { /* RFC3517 uses different metric in lost marker => reset on change */ if (tcp_is_fack(tp)) @@ -881,86 +815,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; } -/* Initialize metrics on socket. */ - -static void tcp_init_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (dst == NULL) - goto reset; - - dst_confirm(dst); - - if (dst_metric_locked(dst, RTAX_CWND)) - tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); - if (dst_metric(dst, RTAX_SSTHRESH)) { - tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } else { - /* ssthresh may have been reduced unnecessarily during. - * 3WHS. Restore it back to its initial default. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - } - if (dst_metric(dst, RTAX_REORDERING) && - tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); - tp->reordering = dst_metric(dst, RTAX_REORDERING); - } - - if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) - goto reset; - - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. - * - * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. 
In normal circumstances sending small - * packets force peer to delay ACKs and calculation is correct too. - * The algorithm is adaptive and, provided we follow specs, it - * NEVER underestimate RTT. BUT! If peer tries to make some clever - * tricks sort of "quick acks" for time long enough to decrease RTT - * to low value, and then abruptly stops to do it and starts to delay - * ACKs, wait for troubles. - */ - if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric_rtt(dst, RTAX_RTT); - tp->rtt_seq = tp->snd_nxt; - } - if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } - tcp_set_rto(sk); -reset: - if (tp->srtt == 0) { - /* RFC6298: 5.7 We've failed to get a valid RTT sample from - * 3WHS. This is most likely due to retransmission, - * including spurious one. Reset the RTO back to 3secs - * from the more aggressive 1sec to avoid more spurious - * retransmission. - */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; - } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. - */ - if (tp->total_retrans > 1) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static void tcp_update_reordering(struct sock *sk, const int metric, const int ts) { @@ -1252,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1261,9 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; - int flag; + int reord; + int fack_count; + long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + int flag; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1307,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; } - err = tcp_fragment(sk, skb, pkt_len, mss); + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1324,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - bool dup_sack, int pcount) + int dup_sack, int pcount, + const struct skb_mstamp *xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. 
*/ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1362,10 +1218,16 @@ static u8 tcp_sacktag_one(struct sock *sk, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(end_seq, tp->frto_highmark)) - state->flag |= FLAG_ONLY_ORIG_SACKED; + if (!after(end_seq, tp->high_seq)) + state->flag |= FLAG_ORIG_SACK_ACKED; + /* Pick the earliest sequence sacked for RTT */ + if (state->rtt_us < 0) { + struct skb_mstamp now; + + skb_mstamp_get(&now); + state->rtt_us = skb_mstamp_us_delta(&now, + xmit_time); + } } if (sacked & TCPCB_LOST) { @@ -1423,7 +1285,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount); + start_seq, end_seq, dup_sack, pcount, + &skb->skb_mstamp); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1464,14 +1327,15 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->scoreboard_skb_hint) - tp->scoreboard_skb_hint = prev; if (skb == tp->lost_skb_hint) { tp->lost_skb_hint = prev; tp->lost_cnt_hint -= tcp_skb_pcount(prev); } - TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; + TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + TCP_SKB_CB(prev)->end_seq++; + if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); @@ -1699,7 +1563,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, dup_sack, - tcp_skb_pcount(skb)); + tcp_skb_pcount(skb), + &skb->skb_mstamp); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1756,9 +1621,8 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una) + u32 prior_snd_una, long *sack_rtt_us) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); @@ -1775,6 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; + state.rtt_us = -1L; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1931,12 +1796,6 @@ walk: start_seq, end_seq, dup_sack); advance_sp: - /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct - * due to in-order walk - */ - if (after(end_seq, tp->frto_highmark)) - state.flag &= ~FLAG_ONLY_ORIG_SACKED; - i++; } @@ -1953,8 +1812,7 @@ advance_sp: tcp_verify_left_out(tp); if ((state.reord < tp->fackets_out) && - ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && - (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) + ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); out: @@ -1965,6 +1823,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif + *sack_rtt_us = state.rtt_us; return state.flag; } @@ -2028,205 +1887,13 @@ static inline void tcp_reset_reno_sack(struct tcp_sock 
*tp) tp->sacked_out = 0; } -static int tcp_is_sackfrto(const struct tcp_sock *tp) -{ - return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); -} - -/* F-RTO can only be used if TCP has never retransmitted anything other than - * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) - */ -bool tcp_use_frto(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - - if (!sysctl_tcp_frto) - return false; - - /* MTU probe and F-RTO won't really play nicely along currently */ - if (icsk->icsk_mtup.probe_size) - return false; - - if (tcp_is_sackfrto(tp)) - return true; - - /* Avoid expensive walking of rexmit queue if possible */ - if (tp->retrans_out > 1) - return false; - - skb = tcp_write_queue_head(sk); - if (tcp_skb_is_last(sk, skb)) - return true; - skb = tcp_write_queue_next(sk, skb); /* Skips head */ - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - return false; - /* Short-circuit when first non-SACKed skb has been checked */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - break; - } - return true; -} - -/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO - * recovery a bit and use heuristics in tcp_process_frto() to detect if - * the RTO was spurious. Only clear SACKED_RETRANS of the head here to - * keep retrans_out counting accurate (with SACK F-RTO, other than head - * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS - * bits are handled if the Loss state is really to be entered (in - * tcp_enter_frto_loss). - * - * Do like tcp_enter_loss() would; when RTO expires the second time it - * does: - * "Reduce ssthresh if it has not yet been made inside this window." - */ -void tcp_enter_frto(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || - tp->snd_una == tp->high_seq || - ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && - !icsk->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(sk); - /* Our state is too optimistic in ssthresh() call because cwnd - * is not reduced until tcp_enter_frto_loss() when previous F-RTO - * recovery has not yet completed. Pattern would be this: RTO, - * Cumulative ACK, RTO (2xRTO for the same segment does not end - * up here twice). - * RFC4138 should be more specific on what to do, even though - * RTO is quite unlikely to occur after the first Cumulative ACK - * due to back-off and complexity of triggering events ... - */ - if (tp->frto_counter) { - u32 stored_cwnd; - stored_cwnd = tp->snd_cwnd; - tp->snd_cwnd = 2; - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = stored_cwnd; - } else { - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - } - /* ... in theory, cong.control module could do "any tricks" in - * ssthresh(), which means that ca_state, lost bits and lost_out - * counter would have to be faked before the call occurs. 
We - * consider that too expensive, unlikely and hacky, so modules - * using these in ssthresh() must deal these incompatibility - * issues if they receives CA_EVENT_FRTO and frto_counter != 0 - */ - tcp_ca_event(sk, CA_EVENT_FRTO); - } - - tp->undo_marker = tp->snd_una; - tp->undo_retrans = 0; - - skb = tcp_write_queue_head(sk); - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - } - tcp_verify_left_out(tp); - - /* Too bad if TCP was application limited */ - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); - - /* Earlier loss recovery underway (see RFC4138; Appendix B). - * The last condition is necessary at least in tp->frto_counter case. - */ - if (tcp_is_sackfrto(tp) && (tp->frto_counter || - ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && - after(tp->high_seq, tp->snd_una)) { - tp->frto_highmark = tp->high_seq; - } else { - tp->frto_highmark = tp->snd_nxt; - } - tcp_set_ca_state(sk, TCP_CA_Disorder); - tp->high_seq = tp->snd_nxt; - tp->frto_counter = 1; -} - -/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, - * which indicates that we should follow the traditional RTO recovery, - * i.e. mark everything lost and do go-back-N retransmission. - */ -static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - tp->lost_out = 0; - tp->retrans_out = 0; - if (tcp_is_reno(tp)) - tcp_reset_reno_sack(tp); - - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - /* - * Count the retransmission made on RTO correctly (only when - * waiting for the first ACK and did not get it)... - */ - if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { - /* For some reason this R-bit might get cleared? */ - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out += tcp_skb_pcount(skb); - /* ...enter this if branch just for the first segment */ - flag |= FLAG_DATA_ACKED; - } else { - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - } - - /* Marking forward transmissions that were made after RTO lost - * can cause unnecessary retransmissions in some scenarios, - * SACK blocks will mitigate that in some but not in all cases. - * We used to not mark them but it was causing break-ups with - * receivers that do only in-order receival. - * - * TODO: we could detect presence of such receiver and select - * different behavior per flow. 
- */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; - } - } - tcp_verify_left_out(tp); - - tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; - tp->frto_counter = 0; - tp->bytes_acked = 0; - - tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); - tcp_set_ca_state(sk, TCP_CA_Loss); - tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); - - tcp_clear_all_retrans_hints(tp); -} - static void tcp_clear_retrans_partial(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; } void tcp_clear_retrans(struct tcp_sock *tp) @@ -2246,10 +1913,13 @@ void tcp_enter_loss(struct sock *sk, int how) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + bool new_recovery = false; /* Reduce ssthresh if it has not yet been made inside this window. */ - if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + if (icsk->icsk_ca_state <= TCP_CA_Disorder || + !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + new_recovery = true; tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); @@ -2258,17 +1928,13 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->bytes_acked = 0; tcp_clear_retrans_partial(tp); if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - if (!how) { - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - tp->undo_marker = tp->snd_una; - } else { + tp->undo_marker = tp->snd_una; + if (how) { tp->sacked_out = 0; tp->fackets_out = 0; } @@ -2278,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2290,13 +1957,24 @@ void tcp_enter_loss(struct sock *sk, int how) } tcp_verify_left_out(tp); - tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + /* Timeout in disordered state after receiving substantial DUPACKs + * suggests that the degree of reordering is over-estimated. + */ + if (icsk->icsk_ca_state <= TCP_CA_Disorder && + tp->sacked_out >= sysctl_tcp_reordering) + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); - /* Abort F-RTO algorithm if one is in progress */ - tp->frto_counter = 0; + + /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous + * loss recovery is underway except recurring timeout(s) on + * the same SND.UNA (sec 3.2). 
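tcp_clear_retrans_partial() above now seeds undo_retrans with -1 instead of 0, which lets "nothing retransmitted yet" be told apart from "every retransmission has been reported back as a duplicate". A rough sketch of how that sentinel plays out; the increment on retransmission lives in tcp_output.c and is not part of this hunk, and the helper names here are illustrative:

	/* -1: no retransmission recorded yet; >0: retransmissions still
	 * unaccounted for; 0: all retransmissions D-SACKed, undo is safe. */
	static long undo_retrans = -1;

	static void on_retransmit(int pcount)
	{
		if (undo_retrans < 0)
			undo_retrans = 0;
		undo_retrans += pcount;
	}

	static void on_dsack_for_retrans(void)
	{
		if (undo_retrans > 0)		/* matches the "> 0" tests above */
			undo_retrans--;
	}

	static int dsack_proves_undo(void)
	{
		return undo_retrans == 0;
	}
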
Disable F-RTO on path MTU probing + */ + tp->frto = sysctl_tcp_frto && + (new_recovery || icsk->icsk_retransmits) && + !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2355,32 +2033,21 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples * available, or RTO is scheduled to fire first. */ - if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) + if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || + (flag & FLAG_ECE) || !tp->srtt_us) return false; - delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + delay = max(usecs_to_jiffies(tp->srtt_us >> 5), + msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); - tp->early_retrans_delayed = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, + TCP_RTO_MAX); return true; } -static inline int tcp_skb_timedout(const struct sock *sk, - const struct sk_buff *skb) -{ - return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; -} - -static inline int tcp_head_timedout(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->packets_out && - tcp_skb_timedout(sk, tcp_write_queue_head(sk)); -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2479,10 +2146,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; - /* Do not perform any recovery during F-RTO algorithm */ - if (tp->frto_counter) - return false; - /* Trick#1: The loss is proven. */ if (tp->lost_out) return true; @@ -2491,12 +2154,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#3 : when we use RFC2988 timer restart, fast - * retransmit can be triggered by timeout of queue head. - */ - if (tcp_is_fack(tp) && tcp_head_timedout(sk)) - return true; - /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? */ @@ -2526,51 +2183,13 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) * interval if appropriate. */ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && + (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && !tcp_may_send_now(sk)) return !tcp_pause_early_retransmit(sk, flag); return false; } -/* New heuristics: it is possible only after we switched to restart timer - * each time when something is ACKed. Hence, we can detect timed out packets - * during fast retransmit without falling to slow start. - * - * Usefulness of this as is very questionable, since we should know which of - * the segments is the next to timeout which is relatively expensive to find - * in general case unless we add some data structure just for that. The - * current approach certainly won't find the right one too often and when it - * finally does find _something_ it usually marks large part of the window - * right away (because a retransmission with a larger timestamp blocks the - * loop from advancing). 
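The early-retransmit pause above arms a timer for max(RTT/4, 2 ms); because srtt_us keeps the smoothed RTT left-shifted by three bits, a right shift by five yields RTT/4. A self-contained illustration of just that arithmetic, in plain microseconds rather than jiffies:

	#include <stdint.h>

	/* srtt_8x_us is 8 * SRTT in microseconds, the same scaling as tp->srtt_us. */
	static uint64_t early_retrans_delay_us(uint64_t srtt_8x_us)
	{
		uint64_t quarter_rtt = srtt_8x_us >> 5;	/* (8*SRTT)/32 = SRTT/4 */
		uint64_t floor_us    = 2000;		/* 2 ms lower bound */

		return quarter_rtt > floor_us ? quarter_rtt : floor_us;
	}

	/* Example: SRTT = 100 ms -> srtt_8x_us = 800000 -> delay = 25 ms. */
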
-ij - */ -static void tcp_timeout_skbs(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) - return; - - skb = tp->scoreboard_skb_hint; - if (tp->scoreboard_skb_hint == NULL) - skb = tcp_write_queue_head(sk); - - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (!tcp_skb_timedout(sk, skb)) - break; - - tcp_skb_mark_lost(tp, skb); - } - - tp->scoreboard_skb_hint = skb; - - tcp_verify_left_out(tp); -} - /* Detect loss in event "A" above by marking head of queue up as lost. * For FACK or non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it @@ -2622,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = skb_shinfo(skb)->gso_size; - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, + mss, GFP_ATOMIC); if (err < 0) break; cnt = packets; @@ -2656,8 +2276,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) else if (fast_rexmit) tcp_mark_head_lost(sk, 1, 1); } - - tcp_timeout_skbs(sk); } /* CWND moderation, preventing bursts due to too big ACKs @@ -2670,39 +2288,10 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } -/* Lower bound on congestion window is slow start threshold - * unless congestion avoidance choice decides to overide it. - */ -static inline u32 tcp_cwnd_min(const struct sock *sk) -{ - const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - - return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; -} - -/* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - int decr = tp->snd_cwnd_cnt + 1; - - if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || - (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { - tp->snd_cwnd_cnt = decr & 1; - decr >>= 1; - - if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) - tp->snd_cwnd -= decr; - - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); - tp->snd_cwnd_stamp = tcp_time_stamp; - } -} - /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ -static inline int tcp_packet_delayed(const struct tcp_sock *tp) +static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2741,10 +2330,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) 
do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) +static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); + if (unmark_loss) { + struct sk_buff *skb; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + tp->lost_out = 0; + tcp_clear_all_retrans_hints(tp); + } + if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2753,7 +2354,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { + if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } @@ -2761,9 +2362,10 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; + tp->undo_marker = 0; } -static inline int tcp_may_undo(const struct tcp_sock *tp) +static inline bool tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } @@ -2780,14 +2382,13 @@ static bool tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, true); + tcp_undo_cwnd_reduction(sk, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS_BH(sock_net(sk), mib_idx); - tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2801,16 +2402,17 @@ static bool tcp_try_undo_recovery(struct sock *sk) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk) +static bool tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, true); - tp->undo_marker = 0; + tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + return true; } + return false; } /* We can clear retrans_stamp when there are no retransmissions in the @@ -2842,82 +2444,102 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } -/* Undo during fast recovery after partial ACK. */ - -static int tcp_try_undo_partial(struct sock *sk, int acked) +/* Undo during loss recovery after partial ACK or using F-RTO. */ +static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); - /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); - if (tcp_may_undo(tp)) { - /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. - */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; - - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + if (frto_undo || tcp_may_undo(tp)) { + tcp_undo_cwnd_reduction(sk, true); - DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, false); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); - - /* So... Do not make Hoe's retransmit yet. - * If the first packet was delayed, the rest - * ones are most probably delayed as well. 
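tcp_undo_cwnd_reduction() above restores the window once a reduction proves unnecessary and, when unmark_loss is set, also clears every TCPCB_LOST tag so the scoreboard agrees with the undone state. A small sketch of the cwnd-restore rule only; ca_undo_cwnd stands in for the congestion module's optional undo_cwnd() hook, with 0 meaning the module has none:

	#include <stdint.h>

	static uint32_t restore_cwnd(uint32_t snd_cwnd, uint32_t snd_ssthresh,
				     uint32_t ca_undo_cwnd)
	{
		if (ca_undo_cwnd)
			return ca_undo_cwnd;	/* module supplies its own undo value */

		/* otherwise never shrink, and allow growth back to 2 * ssthresh */
		return snd_cwnd > (snd_ssthresh << 1) ? snd_cwnd : snd_ssthresh << 1;
	}
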
- */ - failed = 0; + DBGUNDO(sk, "partial loss"); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); + if (frto_undo) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUSRTOS); + inet_csk(sk)->icsk_retransmits = 0; + if (frto_undo || tcp_is_sack(tp)) + tcp_set_ca_state(sk, TCP_CA_Open); + return true; } - return failed; + return false; } -/* Undo during loss recovery after partial ACK. */ -static bool tcp_try_undo_loss(struct sock *sk) +/* The cwnd reduction in CWR and Recovery use the PRR algorithm + * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ + * It computes the number of packets to send (sndcnt) based on packets newly + * delivered: + * 1) If the packets in flight is larger than ssthresh, PRR spreads the + * cwnd reductions across a full RTT. + * 2) If packets in flight is lower than ssthresh (such as due to excess + * losses and/or application stalls), do not perform any further cwnd + * reductions, but instead slow start up to ssthresh. + */ +static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_may_undo(tp)) { - struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - } + tp->high_seq = tp->snd_nxt; + tp->tlp_high_seq = 0; + tp->snd_cwnd_cnt = 0; + tp->prior_cwnd = tp->snd_cwnd; + tp->prr_delivered = 0; + tp->prr_out = 0; + if (set_ssthresh) + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + TCP_ECN_queue_cwr(tp); +} - tcp_clear_all_retrans_hints(tp); +static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, + int fast_rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); + int newly_acked_sacked = prior_unsacked - + (tp->packets_out - tp->sacked_out); - DBGUNDO(sk, "partial loss"); - tp->lost_out = 0; - tcp_undo_cwr(sk, true); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); - inet_csk(sk)->icsk_retransmits = 0; - tp->undo_marker = 0; - if (tcp_is_sack(tp)) - tcp_set_ca_state(sk, TCP_CA_Open); - return true; + tp->prr_delivered += newly_acked_sacked; + if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + + tp->prior_cwnd - 1; + sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = min_t(int, delta, + max_t(int, tp->prr_delivered - tp->prr_out, + newly_acked_sacked) + 1); } - return false; + + sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } -static inline void tcp_complete_cwr(struct sock *sk) +static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - /* Do not moderate cwnd if it's already undone in cwr or recovery. */ - if (tp->undo_marker) { - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_stamp = tcp_time_stamp; - } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { - /* PRR algorithm. */ - tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; - } + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || + (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } +/* Enter CWR state. 
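A worked example may help with the PRR arithmetic in tcp_cwnd_reduction() above: with prior_cwnd = 10, ssthresh = 5, prr_out = 0 and two packets newly delivered, the rounded-up division gives sndcnt = 1, i.e. roughly one segment sent per two delivered until cwnd reaches ssthresh. A compact sketch of that proportional step only, in user-space C rather than the kernel code:

	#include <stdint.h>

	/* Proportional phase of PRR (packets in flight still above ssthresh). */
	static int prr_sndcnt(uint32_t ssthresh, uint32_t prior_cwnd,
			      uint32_t prr_delivered, uint32_t prr_out)
	{
		/* round the quotient up, as the "+ prior_cwnd - 1" in the patch does */
		uint64_t dividend = (uint64_t)ssthresh * prr_delivered + prior_cwnd - 1;

		return (int)(dividend / prior_cwnd) - (int)prr_out;
	}

	/* prr_sndcnt(5, 10, 2, 0) == 1, prr_sndcnt(5, 10, 4, 1) == 1, ... */
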
Disable cwnd undo since congestion is proven with ECN */ +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tcp_init_cwnd_reduction(sk, set_ssthresh); + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + static void tcp_try_keep_open(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2932,13 +2554,13 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag) +static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) { struct tcp_sock *tp = tcp_sk(sk); tcp_verify_left_out(tp); - if (!tp->frto_counter && !tcp_any_retrans_done(sk)) + if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) @@ -2946,10 +2568,8 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) - tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(sk, flag); + tcp_cwnd_reduction(sk, prior_unsacked, 0); } } @@ -3031,38 +2651,6 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -/* This function implements the PRR algorithm, specifcally the PRR-SSRB - * (proportional rate reduction with slow start reduction bound) as described in - * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt. - * It computes the number of packets to send (sndcnt) based on packets newly - * delivered: - * 1) If the packets in flight is larger than ssthresh, PRR spreads the - * cwnd reductions across a full RTT. - * 2) If packets in flight is lower than ssthresh (such as due to excess - * losses and/or application stalls), do not perform any further cwnd - * reductions, but instead slow start up to ssthresh. - */ -static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, - int fast_rexmit, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - int sndcnt = 0; - int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); - - if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { - u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + - tp->prior_cwnd - 1; - sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else { - sndcnt = min_t(int, delta, - max_t(int, tp->prr_delivered - tp->prr_out, - newly_acked_sacked) + 1); - } - - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); - tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; -} - static void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); @@ -3075,26 +2663,103 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); - tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tp->retrans_out ? : -1; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); + tcp_init_cwnd_reduction(sk, true); } - - tp->bytes_acked = 0; - tp->snd_cwnd_cnt = 0; - tp->prior_cwnd = tp->snd_cwnd; - tp->prr_delivered = 0; - tp->prr_out = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } +/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are + * recovered or spurious. Otherwise retransmits more on partial ACKs. 
+ */ +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + bool recovered = !before(tp->snd_una, tp->high_seq); + + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) + return; + + if (after(tp->snd_nxt, tp->high_seq) && + (flag & FLAG_DATA_SACKED || is_dupack)) { + tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ + } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { + tp->high_seq = tp->snd_nxt; + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; /* Step 2.b */ + tp->frto = 0; + } + } + + if (recovered) { + /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ + icsk->icsk_retransmits = 0; + tcp_try_undo_recovery(sk); + return; + } + if (flag & FLAG_DATA_ACKED) + icsk->icsk_retransmits = 0; + if (tcp_is_reno(tp)) { + /* A Reno DUPACK means new data in F-RTO step 2.b above are + * delivered. Lower inflight to clock out (re)tranmissions. + */ + if (after(tp->snd_nxt, tp->high_seq) && is_dupack) + tcp_add_reno_sack(sk); + else if (flag & FLAG_SND_UNA_ADVANCED) + tcp_reset_reno_sack(tp); + } + if (tcp_try_undo_loss(sk, false)) + return; + tcp_xmit_retransmit_queue(sk); +} + +/* Undo during fast recovery after partial ACK. */ +static bool tcp_try_undo_partial(struct sock *sk, const int acked, + const int prior_unsacked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->undo_marker && tcp_packet_delayed(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + + /* We are getting evidence that the reordering degree is higher + * than we realized. If there are no retransmits out then we + * can undo. Otherwise we clock out new packets but do not + * mark more packets lost or retransmit more. + */ + if (tp->retrans_out) { + tcp_cwnd_reduction(sk, prior_unsacked, 0); + return true; + } + + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + + DBGUNDO(sk, "partial recovery"); + tcp_undo_cwnd_reduction(sk, true); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + tcp_try_keep_open(sk); + return true; + } + return false; +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -3106,13 +2771,13 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). 
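The F-RTO handling in tcp_process_loss() above reduces, for each ACK received while tp->frto is set, to a three-way decision. A deliberately condensed restatement follows; this helper is not kernel code, and it folds the snd_nxt/high_seq bookkeeping of steps 2.b and 3.a into its inputs:

	enum frto_verdict { FRTO_SPURIOUS, FRTO_REAL_LOSS, FRTO_KEEP_PROBING };

	static enum frto_verdict frto_judge(int never_retrans_data_sacked,
					    int post_rto_data_sacked_or_dupack)
	{
		if (never_retrans_data_sacked)
			return FRTO_SPURIOUS;	/* step 3.b: undo, the timeout was bogus */
		if (post_rto_data_sacked_or_dupack)
			return FRTO_REAL_LOSS;	/* loss confirmed, stay in recovery */
		return FRTO_KEEP_PROBING;	/* step 2.b: push new data, next ACK decides */
	}
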
*/ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int newly_acked_sacked, bool is_dupack, - int flag) +static void tcp_fastretrans_alert(struct sock *sk, const int acked, + const int prior_unsacked, + bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); int fast_rexmit = 0; @@ -3140,17 +2805,11 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { - case TCP_CA_Loss: - icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk)) - return; - break; - case TCP_CA_CWR: /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { - tcp_complete_cwr(sk); + tcp_end_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -3160,7 +2819,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; - tcp_complete_cwr(sk); + tcp_end_cwnd_reduction(sk); break; } } @@ -3171,22 +2830,23 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else - do_lost = tcp_try_undo_partial(sk, pkts_acked); - break; - case TCP_CA_Loss: - if (flag & FLAG_DATA_ACKED) - icsk->icsk_retransmits = 0; - if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) - tcp_reset_reno_sack(tp); - if (!tcp_try_undo_loss(sk)) { - tcp_moderate_cwnd(tp); - tcp_xmit_retransmit_queue(sk); + } else { + if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + return; + /* Partial ACK arrived. Force fast retransmit. */ + do_lost = tcp_is_reno(tp) || + tcp_fackets_out(tp) > tp->reordering; + } + if (tcp_try_undo_dsack(sk)) { + tcp_try_keep_open(sk); return; } + break; + case TCP_CA_Loss: + tcp_process_loss(sk, flag, is_dupack); if (icsk->icsk_ca_state != TCP_CA_Open) return; - /* Loss is undone; fall through to processing in Open state. */ + /* Fall through to processing in Open state. */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -3199,7 +2859,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag); + tcp_try_to_open(sk, flag, prior_unsacked); return; } @@ -3219,78 +2879,70 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, fast_rexmit = 1; } - if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tp->prr_delivered += newly_acked_sacked; - tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); tcp_xmit_retransmit_queue(sk); } -void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, + long seq_rtt_us, long sack_rtt_us) { - tcp_rtt_estimator(sk, seq_rtt); - tcp_set_rto(sk); - inet_csk(sk)->icsk_backoff = 0; -} -EXPORT_SYMBOL(tcp_valid_rtt_meas); + const struct tcp_sock *tp = tcp_sk(sk); + + /* Prefer RTT measured from ACK's timing to TS-ECR. This is because + * broken middle-boxes or peers may corrupt TS-ECR fields. 
But + * Karn's algorithm forbids taking RTT if some retransmitted data + * is acked (RFC6298). + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + seq_rtt_us = -1L; + + if (seq_rtt_us < 0) + seq_rtt_us = sack_rtt_us; -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Supersedes RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, int flag) -{ /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. - * * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> - * - * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overestimated rto. With timestamps - * samples are accepted even from very old segments: f.e., when rtt=1 - * increases to 8, we retransmit 5 times and after 8 seconds delayed - * answer arrives rto becomes 120 seconds! If at least one of segments - * in window is lost... Voila. --ANK (010210) */ - struct tcp_sock *tp = tcp_sk(sk); + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + flag & FLAG_ACKED) + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); -} - -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) -{ - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ + if (seq_rtt_us < 0) + return false; - if (flag & FLAG_RETRANS_DATA_ACKED) - return; + tcp_rtt_estimator(sk, seq_rtt_us); + tcp_set_rto(sk); - tcp_valid_rtt_meas(sk, seq_rtt); + /* RFC6298: only reset backoff on valid RTT measurement. */ + inet_csk(sk)->icsk_backoff = 0; + return true; } -static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt) +/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { - const struct tcp_sock *tp = tcp_sk(sk); - /* Note that peer MAY send zero echo. In this case it is ignored. 
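The rewritten tcp_ack_update_rtt() above fixes a strict preference order for RTT samples. A tiny sketch of that ordering; negative values mean "no sample", and the requirement that the TS-ECR sample only counts when the ACK advanced snd_una is folded into its argument here:

	static long choose_rtt_sample(int retrans_data_acked, long ack_timing_rtt_us,
				      long sack_rtt_us, long tsecr_rtt_us)
	{
		/* Karn's rule: an ACK covering retransmitted data is ambiguous. */
		if (retrans_data_acked)
			ack_timing_rtt_us = -1;

		if (ack_timing_rtt_us < 0)	/* fall back to SACK-based RTT */
			ack_timing_rtt_us = sack_rtt_us;
		if (ack_timing_rtt_us < 0)	/* last resort: echoed timestamp */
			ack_timing_rtt_us = tsecr_rtt_us;

		return ack_timing_rtt_us;
	}
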
(rfc1323) */ - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, flag); - else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, flag); + struct tcp_sock *tp = tcp_sk(sk); + long seq_rtt_us = -1L; + + if (synack_stamp && !tp->total_retrans) + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + + /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets + * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() + */ + if (!tp->srtt_us) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -3299,19 +2951,27 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ void tcp_rearm_rto(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + /* If the retrans timer is currently being used by Fast Open + * for SYN-ACK retrans purpose, stay put. + */ + if (tp->fastopen_rsk) + return; + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (tp->early_retrans_delayed) { + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked - * when the delayed ER timer fires and is rescheduled. + * when the retrans timer fires and is rescheduled. */ if (delta > 0) rto = delta; @@ -3319,7 +2979,6 @@ void tcp_rearm_rto(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); } - tp->early_retrans_delayed = 0; } /* This function is called when the delayed ER timer fires. TCP enters @@ -3366,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. 
*/ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una) + u32 prior_snd_una, long sack_rtt_us) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); + struct skb_mstamp first_ackt, last_ackt, now; + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; + bool fully_acked = true; + long ca_seq_rtt_us = -1L; + long seq_rtt_us = -1L; struct sk_buff *skb; - u32 now = tcp_time_stamp; - int fully_acked = true; - int flag = 0; u32 pkts_acked = 0; - u32 reord = tp->packets_out; - u32 prior_sacked = tp->sacked_out; - s32 seq_rtt = -1; - s32 ca_seq_rtt = -1; - ktime_t last_ackt = net_invalid_timestamp(); + bool rtt_update; + int flag = 0; + + first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 acked_pcount; u8 sacked = scb->sacked; + u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3405,18 +3066,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; - ca_seq_rtt = -1; - seq_rtt = -1; - if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) - flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } + last_ackt = skb->skb_mstamp; + WARN_ON_ONCE(last_ackt.v64 == 0); + if (!first_ackt.v64) + first_ackt = last_ackt; + if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) @@ -3446,7 +3105,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tp->scoreboard_skb_hint = NULL; if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = NULL; if (skb == tp->lost_skb_hint) @@ -3459,18 +3117,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; + skb_mstamp_get(&now); + if (first_ackt.v64) { + seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); + ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); + if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); } - tcp_ack_update_rtt(sk, flag, seq_rtt); - tcp_rearm_rto(sk); - if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); } else { @@ -3487,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - s32 rtt_us = -1; - - /* Is the ACK triggering packet unambiguous? */ - if (!(flag & FLAG_RETRANS_DATA_ACKED)) { - /* High resolution needed and available? 
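tcp_clean_rtx_queue() above tracks two timestamps while walking the newly acked skbs: first_ackt, from the oldest never-retransmitted skb, feeds the RTO estimator, while last_ackt, from the newest, is what the congestion module's pkts_acked() hook receives. A stand-alone sketch of that bookkeeping, using plain microsecond counters instead of skb_mstamp:

	#include <stdint.h>

	struct ack_walk_sketch {
		uint64_t first_ackt_us;	/* 0 = no sample yet */
		uint64_t last_ackt_us;
	};

	/* Call once per cleanly acked, never-retransmitted skb, oldest first. */
	static void note_acked_skb(struct ack_walk_sketch *w, uint64_t xmit_time_us)
	{
		w->last_ackt_us = xmit_time_us;
		if (!w->first_ackt_us)
			w->first_ackt_us = xmit_time_us;
	}

	/* Afterwards: seq_rtt = now - first_ackt (conservative, drives the RTO),
	 *             ca_rtt  = now - last_ackt  (freshest, passed to pkts_acked()). */
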
*/ - if (ca_ops->flags & TCP_CONG_RTT_STAMP && - !ktime_equal(last_ackt, - net_invalid_timestamp())) - rtt_us = ktime_us_delta(ktime_get_real(), - last_ackt); - else if (ca_seq_rtt >= 0) - rtt_us = jiffies_to_usecs(ca_seq_rtt); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } + } else if (skb && rtt_update && sack_rtt_us >= 0 && + sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + /* Do not re-arm RTO if the sack RTT is measured from data sent + * after when the head was last (re)transmitted. Otherwise the + * timeout may continue to extend in loss recovery. + */ + tcp_rearm_rto(sk); } #if FASTRETRANS_DEBUG > 0 @@ -3552,23 +3209,34 @@ static void tcp_ack_probe(struct sock *sk) } } -static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) +static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) { return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } -static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) +/* Decide wheather to run the increase function of congestion control. */ +static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - const struct tcp_sock *tp = tcp_sk(sk); - return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); + if (tcp_in_cwnd_reduction(sk)) + return false; + + /* If reordering is high then always grow cwnd whenever data is + * delivered regardless of its ordering. Otherwise stay conservative + * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ + * new SACK or ECE mark may first advance cwnd here and later reduce + * cwnd in tcp_fastretrans_alert() based on more states. + */ + if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; } /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, +static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { @@ -3617,144 +3285,75 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } -/* A very conservative spurious RTO response algorithm: reduce cwnd and - * continue in congestion avoidance. - */ -static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +/* RFC 5961 7 [ACK Throttling] */ +static void tcp_send_challenge_ack(struct sock *sk) { - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; - TCP_ECN_queue_cwr(tp); - tcp_moderate_cwnd(tp); + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 now = jiffies / HZ; + + if (now != challenge_timestamp) { + challenge_timestamp = now; + challenge_count = 0; + } + if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } } -/* A conservative spurious RTO response algorithm: reduce cwnd using - * rate halving and continue in congestion avoidance. 
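tcp_send_challenge_ack() above applies the RFC 5961 ACK-throttling idea with one global, deliberately racy per-second budget. The same pattern in isolation; the limit is a parameter here, whereas the kernel takes it from sysctl_tcp_challenge_ack_limit:

	#include <stdint.h>

	static uint32_t     challenge_sec;	/* second in which the budget was reset */
	static unsigned int challenge_count;

	/* Returns nonzero if a challenge ACK may still be sent this second. */
	static int may_send_challenge_ack(uint32_t now_sec, unsigned int limit)
	{
		if (now_sec != challenge_sec) {
			challenge_sec = now_sec;
			challenge_count = 0;
		}
		return ++challenge_count <= limit;
	}
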
- */ -static void tcp_ratehalving_spur_to_response(struct sock *sk) +static void tcp_store_ts_recent(struct tcp_sock *tp) { - tcp_enter_cwr(sk, 0); + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); } -static void tcp_undo_spur_to_response(struct sock *sk, int flag) +static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { - if (flag & FLAG_ECE) - tcp_ratehalving_spur_to_response(sk); - else - tcp_undo_cwr(sk, true); + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if (tcp_paws_check(&tp->rx_opt, 0)) + tcp_store_ts_recent(tp); + } } -/* F-RTO spurious RTO detection algorithm (RFC4138) - * - * F-RTO affects during two new ACKs following RTO (well, almost, see inline - * comments). State (ACK number) is kept in frto_counter. When ACK advances - * window (but not to or beyond highest sequence sent before RTO): - * On First ACK, send two new segments out. - * On Second ACK, RTO was likely spurious. Do spurious response (response - * algorithm is not part of the F-RTO detection algorithm - * given in RFC4138 but can be selected separately). - * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. F-RTO allows overriding - * of Nagle, this is done using frto_counter states 2 and 3, when a new data - * segment of any size sent during F-RTO, state 2 is upgraded to 3. - * - * Rationale: if the RTO was spurious, new ACKs should arrive from the - * original window even after we transmit two new data segments. - * - * SACK version: - * on first step, wait until first cumulative ACK arrives, then move to - * the second step. In second step, the next ACK decides. - * - * F-RTO is implemented (mainly) in four functions: - * - tcp_use_frto() is used to determine if TCP is can use F-RTO - * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is - * called when tcp_use_frto() showed green light - * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm - * - tcp_enter_frto_loss() is called if there is not enough evidence - * to prove that the RTO is indeed spurious. It transfers the control - * from F-RTO to the conventional RTO recovery +/* This routine deals with acks during a TLP episode. + * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. */ -static bool tcp_process_frto(struct sock *sk, int flag) +static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { struct tcp_sock *tp = tcp_sk(sk); + bool is_tlp_dupack = (ack == tp->tlp_high_seq) && + !(flag & (FLAG_SND_UNA_ADVANCED | + FLAG_NOT_DUP | FLAG_DATA_SACKED)); - tcp_verify_left_out(tp); - - /* Duplicate the behavior from Loss state (fastretrans_alert) */ - if (flag & FLAG_DATA_ACKED) - inet_csk(sk)->icsk_retransmits = 0; - - if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || - ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) - tp->undo_marker = 0; - - if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), flag); - return true; - } - - if (!tcp_is_sackfrto(tp)) { - /* RFC4138 shortcoming in step 2; should also have case c): - * ACK isn't duplicate nor advances window, e.g., opposite dir - * data, winupdate - */ - if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) - return true; - - if (!(flag & FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), - flag); - return true; - } - } else { - if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { - /* Prevent sending of new data. */ - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp)); - return true; - } - - if ((tp->frto_counter >= 2) && - (!(flag & FLAG_FORWARD_PROGRESS) || - ((flag & FLAG_DATA_SACKED) && - !(flag & FLAG_ONLY_ORIG_SACKED)))) { - /* RFC4138 shortcoming (see comment above) */ - if (!(flag & FLAG_FORWARD_PROGRESS) && - (flag & FLAG_NOT_DUP)) - return true; - - tcp_enter_frto_loss(sk, 3, flag); - return true; - } + /* Mark the end of TLP episode on receiving TLP dupack or when + * ack is after tlp_high_seq. + */ + if (is_tlp_dupack) { + tp->tlp_high_seq = 0; + return; } - if (tp->frto_counter == 1) { - /* tcp_may_send_now needs to see updated state */ - tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; - tp->frto_counter = 2; - - if (!tcp_may_send_now(sk)) - tcp_enter_frto_loss(sk, 2, flag); - - return true; - } else { - switch (sysctl_tcp_frto_response) { - case 2: - tcp_undo_spur_to_response(sk, flag); - break; - case 1: - tcp_conservative_spur_to_response(tp); - break; - default: - tcp_ratehalving_spur_to_response(sk); - break; + if (after(ack, tp->tlp_high_seq)) { + tp->tlp_high_seq = 0; + /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ + if (!(flag & FLAG_DSACKING_ACK)) { + tcp_init_cwnd_reduction(sk, true); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); + tcp_try_keep_open(sk); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBERECOVERY); } - tp->frto_counter = 0; - tp->undo_marker = 0; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); } - return false; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -3766,19 +3365,23 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight; u32 prior_fackets; - int prior_packets; - int prior_sacked = tp->sacked_out; - int pkts_acked = 0; - int newly_acked_sacked = 0; - bool frto_cwnd = false; + int prior_packets = tp->packets_out; + const int prior_unsacked = tp->packets_out - tp->sacked_out; + int acked = 0; /* Number of packets newly acked */ + long sack_rtt_us = -1L; /* If the ack is older than previous acks * then we can probably ignore it. */ - if (before(ack, prior_snd_una)) + if (before(ack, prior_snd_una)) { + /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ + if (before(ack, prior_snd_una - tp->max_window)) { + tcp_send_challenge_ack(sk); + return -1; + } goto old_ack; + } /* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). 
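The new test at the top of tcp_ack() above encodes the acceptable-ACK range of RFC 5961 sec. 5.2: an ACK older than snd_una by more than the largest window ever advertised is answered with a rate-limited challenge ACK rather than processed. A sketch of the classification on the 32-bit sequence space:

	#include <stdint.h>

	static int seq_before(uint32_t a, uint32_t b)	/* serial-number "a < b" */
	{
		return (int32_t)(a - b) < 0;
	}

	enum ack_class { ACK_TOO_OLD, ACK_OLD_DUP, ACK_OK, ACK_FUTURE };

	static enum ack_class classify_ack(uint32_t ack, uint32_t snd_una,
					   uint32_t snd_nxt, uint32_t max_window)
	{
		if (seq_before(ack, snd_una - max_window))
			return ACK_TOO_OLD;	/* possible blind injection: challenge ACK */
		if (seq_before(ack, snd_una))
			return ACK_OLD_DUP;	/* old ACK: only its SACK info is useful */
		if (seq_before(snd_nxt, ack))
			return ACK_FUTURE;	/* acks data never sent: discard */
		return ACK_OK;
	}
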
@@ -3786,23 +3389,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (tp->early_retrans_delayed) + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; - if (sysctl_tcp_abc) { - if (icsk->icsk_ca_state < TCP_CA_CWR) - tp->bytes_acked += ack - prior_snd_una; - else if (icsk->icsk_ca_state == TCP_CA_Loss) - /* we assume just one segment left network */ - tp->bytes_acked += min(ack - prior_snd_una, - tp->mss_cache); - } - prior_fackets = tp->fackets_out; - prior_in_flight = tcp_packets_in_flight(tp); + + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + if (flag & FLAG_UPDATE_TS_RECENT) + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. @@ -3825,7 +3425,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_rtt_us); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; @@ -3839,45 +3440,42 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; tp->rcv_tstamp = tcp_time_stamp; - prior_packets = tp->packets_out; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); - - pkts_acked = prior_packets - tp->packets_out; - newly_acked_sacked = (prior_packets - prior_sacked) - - (tp->packets_out - tp->sacked_out); + acked = tp->packets_out; + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + sack_rtt_us); + acked -= tp->packets_out; - if (tp->frto_counter) - frto_cwnd = tcp_process_frto(sk, flag); - /* Guarantee sacktag reordering detection against wrap-arounds */ - if (before(tp->frto_highmark, tp->snd_una)) - tp->frto_highmark = 0; + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, acked); if (tcp_ack_is_dubious(sk, flag)) { - /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && - tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); - } else { - if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) - tcp_cong_avoid(sk, ack, prior_in_flight); } + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) - dst_confirm(__sk_dst_get(sk)); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { + struct dst_entry *dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + } + if (icsk->icsk_pending == ICSK_TIME_RETRANS) + tcp_schedule_loss_probe(sk); + tcp_update_pacing_rate(sk); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); /* If this ack opens up a zero window, clear backoff. 
It was * being used to time the probes, and is probably far higher than @@ -3885,6 +3483,9 @@ no_queue: */ if (tcp_send_head(sk)) tcp_ack_probe(sk); + + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); return 1; invalid_ack: @@ -3896,9 +3497,9 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - newly_acked_sacked = tp->sacked_out - prior_sacked; - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_rtt_us); + tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -3910,8 +3511,9 @@ old_ack: * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab) +void tcp_parse_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx, int estab, + struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3994,32 +3596,24 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o */ break; #endif - case TCPOPT_COOKIE: - /* This option is variable length. + case TCPOPT_EXP: + /* Fast Open option shares code 254 using a + * 16 bits magic number. It's valid only in + * SYN or SYN-ACK with an even size. */ - switch (opsize) { - case TCPOLEN_COOKIE_BASE: - /* not yet implemented */ + if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || + get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || + foc == NULL || !th->syn || (opsize & 1)) break; - case TCPOLEN_COOKIE_PAIR: - /* not yet implemented */ - break; - case TCPOLEN_COOKIE_MIN+0: - case TCPOLEN_COOKIE_MIN+2: - case TCPOLEN_COOKIE_MIN+4: - case TCPOLEN_COOKIE_MIN+6: - case TCPOLEN_COOKIE_MAX: - /* 16-bit multiple */ - opt_rx->cookie_plus = opsize; - *hvpp = ptr; - break; - default: - /* ignore option */ - break; - } + foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; + if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && + foc->len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, ptr + 2, foc->len); + else if (foc->len != 0) + foc->len = -1; break; - } + } ptr += opsize-2; length -= opsize; } @@ -4037,7 +3631,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); + if (*ptr) + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + else + tp->rx_opt.rcv_tsecr = 0; return true; } return false; @@ -4047,8 +3644,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr * If it is wrong it falls back on tcp_parse_options(). */ static bool tcp_fast_parse_options(const struct sk_buff *skb, - const struct tcphdr *th, - struct tcp_sock *tp, const u8 **hvpp) + const struct tcphdr *th, struct tcp_sock *tp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. 
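The option-parsing hunk above accepts the experimental Fast Open cookie: kind 254 (TCPOPT_EXP), a 16-bit magic, then either no cookie bytes (a cookie request) or 4 to 16 cookie bytes, and only on SYN segments with an even option length. A sketch of that wire-format check; the buffer handling is illustrative, the constants follow the patch:

	#include <stdint.h>
	#include <string.h>

	#define TCPOLEN_EXP_FASTOPEN_BASE	4	/* kind + len + 16-bit magic */
	#define TCPOPT_FASTOPEN_MAGIC		0xF989
	#define TCP_FASTOPEN_COOKIE_MIN		4
	#define TCP_FASTOPEN_COOKIE_MAX		16

	/* opt points at the option's kind byte; returns the cookie length,
	 * 0 for a bare cookie request, or -1 if the option must be ignored. */
	static int parse_fastopen_exp(const uint8_t *opt, int opsize, int is_syn,
				      uint8_t *cookie_out)
	{
		int len;

		if (!is_syn || (opsize & 1) || opsize < TCPOLEN_EXP_FASTOPEN_BASE)
			return -1;
		if ((((int)opt[2] << 8) | opt[3]) != TCPOPT_FASTOPEN_MAGIC)
			return -1;

		len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
		if (len == 0)
			return 0;
		if (len < TCP_FASTOPEN_COOKIE_MIN || len > TCP_FASTOPEN_COOKIE_MAX)
			return -1;

		memcpy(cookie_out, opt + TCPOLEN_EXP_FASTOPEN_BASE, len);
		return len;
	}
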
@@ -4061,7 +3657,11 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); + + tcp_parse_options(skb, &tp->rx_opt, 1, NULL); + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + return true; } @@ -4082,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int opcode = *ptr++; int opsize; - switch(opcode) { + switch (opcode) { case TCPOPT_EOL: return NULL; case TCPOPT_NOP: @@ -4103,27 +3703,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) EXPORT_SYMBOL(tcp_parse_md5sig_option); #endif -static inline void tcp_store_ts_recent(struct tcp_sock *tp) -{ - tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); -} - -static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) -{ - if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Not only, also it occurs for expired timestamps. - */ - - if (tcp_paws_check(&tp->rx_opt, 0)) - tcp_store_ts_recent(tp); - } -} - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) @@ -4167,7 +3746,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(const struct sock *sk, +static inline bool tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4189,14 +3768,14 @@ static inline int tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); } /* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk) { /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { @@ -4237,6 +3816,7 @@ static void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -4248,7 +3828,9 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + dst = __sk_dst_get(sk); + if (!dst || !dst_metric(dst, RTAX_QUICKACK)) + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -4460,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. 
*/ - for (i=this_sack+1; i < num_sacks; i++) + for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; @@ -4512,19 +4094,20 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, size)) { + !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, size)) { + if (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - if (!sk_rmem_schedule(sk, size)) + if (!sk_rmem_schedule(sk, skb, size)) return -1; } } @@ -4579,8 +4162,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (tcp_try_rmem_schedule(sk, skb->truesize)) { - /* TODO: should increment a counter */ + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; } @@ -4589,6 +4172,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4614,6 +4198,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; } @@ -4642,6 +4227,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. 
*/ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); @@ -4680,6 +4266,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb1); } @@ -4687,8 +4274,10 @@ add_sack: if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: - if (skb) + if (skb) { + tcp_grow_window(sk, skb); skb_set_owner_r(skb, sk); + } } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, @@ -4710,17 +4299,20 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; struct tcphdr *th; bool fragstolen; - if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) - goto err; + if (size == 0) + return 0; skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); if (!skb) goto err; + if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + goto err_free; + th = (struct tcphdr *)skb_put(skb, sizeof(*th)); skb_reset_transport_header(skb); memset(th, 0, sizeof(*th)); @@ -4791,7 +4383,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb->truesize)) + tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto drop; eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); @@ -4819,8 +4411,8 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); - else if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); return; } @@ -5110,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - /* Limited by application or receiver window. 
*/ - u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; - } - tp->snd_cwnd_used = 0; - } - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -5168,15 +4738,7 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk)) { - int sndmem = SKB_TRUESIZE(max_t(u32, - tp->rx_opt.mss_clamp, - tp->mss_cache) + - MAX_TCP_HEADER); - int demanded = max_t(unsigned int, tp->snd_cwnd, - tp->reordering + 1); - sndmem *= 2 * demanded; - if (sndmem > sk->sk_sndbuf) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tcp_sndbuf_expand(sk); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -5329,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -5372,7 +4934,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, return result; } -static inline int tcp_checksum_complete_user(struct sock *sk, +static inline bool tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && @@ -5415,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } out: return copied_early; @@ -5429,15 +4991,13 @@ out: /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) { - const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp, &hash_location) && - tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5455,38 +5015,48 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ - if (!th->rst) + if (!th->rst) { + if (th->syn) + goto syn_challenge; tcp_send_dupack(sk, skb); + } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - tcp_reset(sk); + /* RFC 5961 3.2 : + * If sequence number exactly matches RCV.NXT, then + * RESET the connection + * else + * Send a challenge ACK + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + tcp_reset(sk); + else + tcp_send_challenge_ack(sk); goto discard; } - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - /* step 3: check security and precedence [ignored] */ - /* step 4: Check for a SYN in window. 
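The hunk above is the RFC 5961 hardening of segment validation: an in-window RST tears the connection down only when its sequence number is exactly rcv_nxt, and a SYN seen on a live connection now draws a challenge ACK instead of a reset. The same decisions, compressed into a sketch rather than the kernel's control flow (the PAWS step is omitted):

#include <stdbool.h>
#include <stdint.h>

enum seg_verdict {
        SEG_OK,               /* continue normal processing */
        SEG_DROP,             /* silently discard */
        SEG_DROP_AND_DUPACK,  /* discard and send a duplicate ACK */
        SEG_RESET_CONN,       /* tear the connection down */
        SEG_CHALLENGE_ACK,    /* RFC 5961 challenge ACK */
};

static enum seg_verdict classify_segment(bool in_window, bool rst, bool syn,
                                         uint32_t seg_seq, uint32_t rcv_nxt)
{
        if (!in_window) {
                if (rst)
                        return SEG_DROP;
                return syn ? SEG_CHALLENGE_ACK : SEG_DROP_AND_DUPACK;
        }
        if (rst)                      /* RFC 5961 3.2 */
                return seg_seq == rcv_nxt ? SEG_RESET_CONN : SEG_CHALLENGE_ACK;
        if (syn)                      /* RFC 5961 4.2 */
                return SEG_CHALLENGE_ACK;
        return SEG_OK;
}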
*/ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* step 4: Check for a SYN + * RFC 5691 4.2 : Send a challenge ack + */ + if (th->syn) { +syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + tcp_send_challenge_ack(sk); + goto discard; } - return 1; + return true; discard: __kfree_skb(skb); - return 0; + return false; } /* @@ -5512,12 +5082,13 @@ discard: * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ -int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); - int res; + if (unlikely(sk->sk_rx_dst == NULL)) + inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. * The code loosely follows the one in the famous @@ -5589,7 +5160,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); - return 0; + return; } else { /* Header too small */ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -5602,7 +5173,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { #ifdef CONFIG_NET_DMA - if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && + tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { copied_early = 1; eaten = 1; } @@ -5638,6 +5211,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_checksum_complete_user(sk, skb)) goto csum_error; + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5649,9 +5225,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ @@ -5679,9 +5252,8 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - else - sk->sk_data_ready(sk, 0); - return 0; + sk->sk_data_ready(sk); + return; } } @@ -5689,16 +5261,18 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; + if (!th->ack && !th->rst) + goto discard; + /* * Standard slow path. 
*/ - res = tcp_validate_incoming(sk, skb, th, 1); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 1)) + return; step5: - if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5711,14 +5285,14 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - return 0; + return; csum_error: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: __kfree_skb(skb); - return 0; } EXPORT_SYMBOL(tcp_rcv_established); @@ -5729,8 +5303,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) + if (skb != NULL) { + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + } /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -5760,16 +5336,62 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) } } +static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, + struct tcp_fastopen_cookie *cookie) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + u16 mss = tp->rx_opt.mss_clamp; + bool syn_drop; + + if (mss == tp->rx_opt.user_mss) { + struct tcp_options_received opt; + + /* Get original SYNACK MSS value if user MSS sets mss_clamp */ + tcp_clear_options(&opt); + opt.user_mss = opt.mss_clamp = 0; + tcp_parse_options(synack, &opt, 0, NULL); + mss = opt.mss_clamp; + } + + if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ + cookie->len = -1; + + /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably + * the remote receives only the retransmitted (regular) SYNs: either + * the original SYN-data or the corresponding SYN-ACK is lost. 
+ */ + syn_drop = (cookie->len <= 0 && data && tp->total_retrans); + + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); + + if (data) { /* Retransmit unacked data in SYN */ + tcp_for_write_queue_from(data, sk) { + if (data == tcp_send_head(sk) || + __tcp_retransmit_skb(sk, data)) + break; + } + tcp_rearm_rto(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + return true; + } + tp->syn_data_acked = tp->syn_data; + if (tp->syn_data_acked) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + return false; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { - const u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; + struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); + tcp_parse_options(skb, &tp->rx_opt, 0, &foc); + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { /* rfc793: @@ -5779,11 +5401,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" - * - * We do not send data with SYN, so that RFC-correct - * test reduces to: */ - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) goto reset_and_undo; if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -5825,7 +5445,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, TCP_ECN_rcv_synack(tp, th); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and @@ -5838,7 +5458,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * never scaled. */ tp->snd_wnd = ntohs(th->window); - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; @@ -5867,34 +5486,14 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; - if (cvp != NULL && - cvp->cookie_pair_size > 0 && - tp->rx_opt.cookie_plus > 0) { - int cookie_size = tp->rx_opt.cookie_plus - - TCPOLEN_COOKIE_BASE; - int cookie_pair_size = cookie_size - + cvp->cookie_desired; - - /* A cookie extension option was sent and returned. - * Note that each incoming SYNACK replaces the - * Responder cookie. The initial exchange is most - * fragile, as protection against spoofing relies - * entirely upon the sequence and timestamp (above). - * This replacement strategy allows the correct pair to - * pass through, while any others will be filtered via - * Responder verification later. 
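Among the SYN-SENT changes above, the ACK acceptability test is relaxed from an exact match against snd_nxt to the RFC 793 range (SND.UNA, SND.NXT], since Fast Open may have put data on the SYN. A small stand-alone sketch of that check (helper names invented here):

#include <stdbool.h>
#include <stdint.h>

static bool seq_after(uint32_t a, uint32_t b) { return (int32_t)(b - a) < 0; }

/* RFC 793: "If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset".
 * With SYN-data in flight, anything in (snd_una, snd_nxt] must be
 * accepted, not only ack_seq == snd_nxt.
 */
static bool synack_ack_acceptable(uint32_t ack_seq,
                                  uint32_t snd_una, uint32_t snd_nxt)
{
        return seq_after(ack_seq, snd_una) && !seq_after(ack_seq, snd_nxt);
}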
- */ - if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { - memcpy(&cvp->cookie_pair[cvp->cookie_desired], - hash_location, cookie_size); - cvp->cookie_pair_size = cookie_pair_size; - } - } - smp_mb(); tcp_finish_connect(sk, skb); + if ((tp->syn_fastopen || tp->syn_data) && + tcp_rcv_fastopen_synack(sk, skb, &foc)) + return -1; + if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -5972,7 +5571,9 @@ discard: tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. - * There are no obstacles to make this. + * There are no obstacles to make this (except that we must + * either change tcp_recvmsg() to prevent it from returning data + * before 3WHS completes per RFC793, or employ TCP Fast Open). * * However, if we ignore data in ACKless segments sometimes, * we have no reasons to accept it sometimes. @@ -6012,8 +5613,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *req; int queued = 0; - int res; + bool acceptable; + u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -6068,118 +5671,167 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - res = tcp_validate_incoming(sk, skb, th, 0); - if (res <= 0) - return -res; + req = tp->fastopen_rsk; + if (req != NULL) { + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && + sk->sk_state != TCP_FIN_WAIT1); + + if (tcp_check_req(sk, skb, req, NULL, true) == NULL) + goto discard; + } + + if (!th->ack && !th->rst) + goto discard; + + if (!tcp_validate_incoming(sk, skb, th, 0)) + return 0; /* step 5: check the ACK field */ - if (th->ack) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; - - switch (sk->sk_state) { - case TCP_SYN_RECV: - if (acceptable) { - tp->copied_seq = tp->rcv_nxt; - smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - sk->sk_state_change(sk); - - /* Note, that this wakeup is only for marginal - * crossed SYN case. Passively open sockets - * are not waked up, because sk->sk_sleep == - * NULL and sk->sk_socket == NULL. - */ - if (sk->sk_socket) - sk_wake_async(sk, - SOCK_WAKE_IO, POLL_OUT); + acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = ntohs(th->window) << - tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + switch (sk->sk_state) { + case TCP_SYN_RECV: + if (!acceptable) + return 1; - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + /* Once we leave TCP_SYN_RECV, we no longer need req + * so release it. + */ + if (req) { + synack_stamp = tcp_rsk(req)->snt_synack; + tp->total_retrans = req->num_retrans; + reqsk_fastopen_remove(sk, req, false); + } else { + synack_stamp = tp->lsndtime; + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_congestion_control(sk); + + tcp_mtup_init(sk); + tp->copied_seq = tp->rcv_nxt; + tcp_init_buffer_space(sk); + } + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); - /* Make sure socket is routed, for - * correct metrics. - */ - icsk->icsk_af_ops->rebuild_header(sk); + /* Note, that this wakeup is only for marginal crossed SYN case. + * Passively open sockets are not waked up, because + * sk->sk_sleep == NULL and sk->sk_socket == NULL. 
+ */ + if (sk->sk_socket) + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - tcp_init_metrics(sk); + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_synack_rtt_meas(sk, synack_stamp); - tcp_init_congestion_control(sk); + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - /* Prevent spurious tcp_cwnd_restart() on - * first data packet. - */ - tp->lsndtime = tcp_time_stamp; + if (req) { + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); + } else + tcp_init_metrics(sk); - tcp_mtup_init(sk); - tcp_initialize_rcv_mss(sk); - tcp_init_buffer_space(sk); - tcp_fast_path_on(tp); - } else { + tcp_update_pacing_rate(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data packet */ + tp->lsndtime = tcp_time_stamp; + + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); + break; + + case TCP_FIN_WAIT1: { + struct dst_entry *dst; + int tmo; + + /* If we enter the TCP_FIN_WAIT1 state and we are a + * Fast Open socket and this is the first acceptable + * ACK we have received, this would have acknowledged + * our SYNACK so stop the SYNACK timer. + */ + if (req != NULL) { + /* Return RST if ack_seq is invalid. + * Note that RFC793 only says to generate a + * DUPACK for it but for TCP Fast Open it seems + * better to treat this case like TCP_SYN_RECV + * above. + */ + if (!acceptable) return 1; - } + /* We no longer need the request sock. */ + reqsk_fastopen_remove(sk, req, false); + tcp_rearm_rto(sk); + } + if (tp->snd_una != tp->write_seq) break; - case TCP_FIN_WAIT1: - if (tp->snd_una == tp->write_seq) { - tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; - dst_confirm(__sk_dst_get(sk)); - - if (!sock_flag(sk, SOCK_DEAD)) - /* Wake up lingering close() */ - sk->sk_state_change(sk); - else { - int tmo; - - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { - tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; - } + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; - tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { - /* Bad case. We could lose such FIN otherwise. - * It is not a big problem, but it looks confusing - * and not so rare event. We still can lose it now, - * if it spins in bh_lock_sock(), but it is really - * marginal case. 
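The FIN_WAIT1 handling above, together with the CLOSING and LAST_ACK cases in the next hunk, keys every transition on whether the peer's ACK now covers all queued data (snd_una == write_seq). A compressed sketch of just those transitions (state names are local to the sketch, timer handling omitted):

#include <stdbool.h>

enum close_state {
        ST_FIN_WAIT1, ST_FIN_WAIT2, ST_CLOSING, ST_TIME_WAIT, ST_LAST_ACK, ST_CLOSE
};

static enum close_state close_side_next_state(enum close_state cur,
                                              bool all_data_acked)
{
        if (!all_data_acked)
                return cur;           /* keep waiting for the covering ACK */
        switch (cur) {
        case ST_FIN_WAIT1: return ST_FIN_WAIT2;
        case ST_CLOSING:   return ST_TIME_WAIT;
        case ST_LAST_ACK:  return ST_CLOSE;
        default:           return cur;
        }
}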
- */ - inet_csk_reset_keepalive_timer(sk, tmo); - } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; - } - } - } - break; + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); - case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk, TCP_TIME_WAIT, 0); - goto discard; - } + if (!sock_flag(sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + sk->sk_state_change(sk); break; + } - case TCP_LAST_ACK: - if (tp->snd_una == tp->write_seq) { - tcp_update_metrics(sk); - tcp_done(sk); - goto discard; - } - break; + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; } - } else - goto discard; + + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + break; + } + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; + } /* step 6: check the URG bit */ tcp_urg(sk, skb, th); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c8d28c433b2..77cccda1ad0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include <net/netdma.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> +#include <net/busy_poll.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -138,14 +139,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); -static int tcp_repair_connect(struct sock *sk) -{ - tcp_connect_init(sk); - tcp_finish_connect(sk, NULL); - - return 0; -} - /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -180,11 +173,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - orig_sport, orig_dport, sk, true); + orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) - IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } @@ -209,22 +202,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (tcp_death_row.sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { - struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); - /* - * VJ's idea. We save last timestamp seen from - * the destination in peer table, when entering state - * TIME-WAIT * and initialize rx_opt.ts_recent from it, - * when trying new connection. 
- */ - if (peer) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; - } - } - } + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) + tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; @@ -264,10 +243,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_id = tp->write_seq ^ jiffies; - if (likely(!tp->repair)) - err = tcp_connect(sk); - else - err = tcp_repair_connect(sk); + err = tcp_connect(sk); rt = NULL; if (err) @@ -289,31 +265,20 @@ failure: EXPORT_SYMBOL(tcp_v4_connect); /* - * This routine does path mtu discovery as defined in RFC1191. + * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. + * It can be called through tcp_release_cb() if socket was owned by user + * at the time tcp_v4_err() was called to handle ICMP message. */ -static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) +static void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); + u32 mtu = tcp_sk(sk)->mtu_info; - /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs - * send out by Linux are always <576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == TCP_LISTEN) - return; - - /* We don't check in the destentry if pmtu discovery is forbidden - * on this route. We just assume that no packet_to_big packets - * are send back when pmtu discovery is not active. - * There is a small race when the user changes this flag in the - * route, but I think that's acceptable. - */ - if ((dst = __sk_dst_check(sk, 0)) == NULL) + dst = inet_csk_update_pmtu(sk, mtu); + if (!dst) return; - dst->ops->update_pmtu(dst, mtu); - /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ @@ -323,6 +288,7 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && + ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); @@ -335,6 +301,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) } /* else let the usual retransmit timer handle it */ } +static void do_redirect(struct sk_buff *skb, struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + + if (dst) + dst->ops->redirect(dst, sk, skb); +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -362,7 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; __u32 remaining; int err; struct net *net = dev_net(icmp_skb->dev); @@ -386,10 +361,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. + * We do take care of PMTU discovery (RFC1191) special case : + * we can receive locally generated ICMP messages while socket is held. 
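tcp_v4_mtu_reduced() above consumes an MTU value that the ICMP path has stashed in tp->mtu_info, because that path may run while the socket is owned by a user context and must defer the real work until the owner releases the socket. The toy below shows that stash-and-defer pattern with a flag word; the field and flag names are invented for the sketch and the locking is deliberately simplified:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define MTU_REDUCED_DEFERRED (1u << 0)

struct toy_sock {
        bool owned_by_user;
        uint32_t mtu_info;            /* last MTU reported by ICMP */
        atomic_uint deferred_flags;   /* work to run when released */
};

static void mtu_reduced(struct toy_sock *sk) { /* shrink MSS, retransmit, ... */ }

static void icmp_frag_needed(struct toy_sock *sk, uint32_t mtu)
{
        sk->mtu_info = mtu;
        if (!sk->owned_by_user)
                mtu_reduced(sk);                      /* safe to act right now */
        else
                atomic_fetch_or(&sk->deferred_flags, MTU_REDUCED_DEFERRED);
}

static void release_sock_cb(struct toy_sock *sk)
{
        unsigned int flags = atomic_exchange(&sk->deferred_flags, 0);

        if (flags & MTU_REDUCED_DEFERRED)
                mtu_reduced(sk);                      /* run the deferred work */
}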
*/ - if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); - + if (sock_owned_by_user(sk)) { + if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + } if (sk->sk_state == TCP_CLOSE) goto out; @@ -401,13 +379,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt)) { + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } switch (type) { + case ICMP_REDIRECT: + do_redirect(icmp_skb, sk); + goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; @@ -419,8 +403,20 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ - if (!sock_owned_by_user(sk)) - do_pmtu_discovery(sk, iph, info); + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + + tp->mtu_info = info; + if (!sock_owned_by_user(sk)) { + tcp_v4_mtu_reduced(sk); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + sock_hold(sk); + } goto out; } @@ -430,14 +426,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) break; if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff) + !icsk->icsk_backoff || fastopen) break; if (sock_owned_by_user(sk)) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : + inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); @@ -492,12 +488,17 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) * errors returned from accept(). */ inet_csk_reqsk_queue_drop(sk, req, prev); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can f.e. if SYNs crossed. - */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; @@ -539,8 +540,7 @@ out: sock_put(sk); } -static void __tcp_v4_send_check(struct sk_buff *skb, - __be32 saddr, __be32 daddr) +void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); @@ -565,23 +565,6 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_v4_send_check); -int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - const struct iphdr *iph; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - iph = ip_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - skb->ip_summed = CHECKSUM_PARTIAL; - __tcp_v4_send_check(skb, iph->saddr, iph->daddr); - return 0; -} - /* * This routine will send an RST to the other tcp. * @@ -651,7 +634,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. 
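The in-flight test above, !between(seq, snd_una, tp->snd_nxt), relies on a single unsigned comparison that stays correct across 2^32 sequence wrap (with snd_una replaced by the Fast Open ISN while a request is still pending). Spelled out as a sketch:

#include <stdbool.h>
#include <stdint.h>

static bool seq_between(uint32_t seq, uint32_t low, uint32_t high)
{
        /* one unsigned comparison handles mod-2^32 wrap-around */
        return high - low >= seq - low;
}

static bool icmp_err_refers_to_flight(uint32_t icmp_seq,
                                      uint32_t snd_una, uint32_t snd_nxt)
{
        return seq_between(icmp_seq, snd_una, snd_nxt);
}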
*/ sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, ip_hdr(skb)->daddr, + &tcp_hashinfo, ip_hdr(skb)->saddr, + th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ if (!sk1) @@ -691,15 +675,16 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When socket is gone, all binding information is lost. - * routing might fail in this case. using iif for oif to - * make sure we can deliver it + * routing might fail in this case. No choice here, if we choose to force + * input interface, we will misroute in case of asymmetric route. */ - arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); + if (sk) + arg.bound_dev_if = sk->sk_bound_dev_if; net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, - &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -718,7 +703,7 @@ release_sk1: */ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 ts, int oif, + u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) { @@ -739,12 +724,12 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - if (ts) { + if (tsecr) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - rep.opt[1] = htonl(tcp_time_stamp); - rep.opt[2] = htonl(ts); + rep.opt[1] = htonl(tsval); + rep.opt[2] = htonl(tsecr); arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; } @@ -759,7 +744,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, #ifdef CONFIG_TCP_MD5SIG if (key) { - int offset = (ts) ? 3 : 0; + int offset = (tsecr) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -781,8 +766,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, - &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } @@ -794,6 +779,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -807,8 +793,13 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, - tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ + tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? 
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_time_stamp, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, @@ -824,39 +815,44 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, - u16 queue_mapping) + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; - struct sk_buff * skb; + struct sk_buff *skb; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, rvp); + skb = tcp_make_synack(sk, dst, req, foc); if (skb) { - __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); + __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); skb_set_queue_mapping(skb, queue_mapping); - err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, - ireq->rmt_addr, + err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, + ireq->ir_rmt_addr, ireq->opt); err = net_xmit_eval(err); + if (!tcp_rsk(req)->snt_synack && !err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; } - dst_release(dst); return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - return tcp_v4_send_synack(sk, NULL, req, rvp, 0); + int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); + + if (!res) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } + return res; } /* @@ -878,8 +874,6 @@ bool tcp_syn_flood_action(struct sock *sk, bool want_cookie = false; struct listen_sock *lopt; - - #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { msg = "Sending cookies"; @@ -890,7 +884,7 @@ bool tcp_syn_flood_action(struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned) { + if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { lopt->synflood_warned = 1; pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -902,8 +896,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. 
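tcp_v4_send_ack() above now takes separate tsval/tsecr arguments and writes them into a 12-byte, 32-bit-aligned timestamp option: two NOPs, then kind/length, then the two timestamp words. The same packing as a stand-alone helper (standard option constants, htonl() from <arpa/inet.h>):

#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

static void build_ts_option(uint32_t opt[3], uint32_t tsval, uint32_t tsecr)
{
        opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                       (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
        opt[1] = htonl(tsval);   /* our current timestamp clock */
        opt[2] = htonl(tsecr);   /* the timestamp being echoed back */
}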
*/ -static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { const struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options_rcu *dopt = NULL; @@ -936,7 +929,6 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; - struct hlist_node *pos; unsigned int size = sizeof(struct in_addr); struct tcp_md5sig_info *md5sig; @@ -950,7 +942,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, if (family == AF_INET6) size = sizeof(struct in6_addr); #endif - hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) { + hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; if (!memcmp(&key->addr, addr, size)) @@ -975,7 +967,7 @@ static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, { union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr; + addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr; return tcp_md5_do_lookup(sk, addr, AF_INET); } @@ -988,7 +980,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); + key = tcp_md5_do_lookup(sk, addr, family); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -1011,7 +1003,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = sock_kmalloc(sk, sizeof(*key), gfp); if (!key) return -ENOMEM; - if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) { + if (!tcp_alloc_md5sig_pool()) { sock_kfree_s(sk, key, sizeof(*key)); return -ENOMEM; } @@ -1029,36 +1021,28 @@ EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) { - struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; - struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); + key = tcp_md5_do_lookup(sk, addr, family); if (!key) return -ENOENT; hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree_rcu(key, rcu); - md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); - if (hlist_empty(&md5sig->head)) - tcp_free_md5sig_pool(); return 0; } EXPORT_SYMBOL(tcp_md5_do_del); -void tcp_clear_md5_list(struct sock *sk) +static void tcp_clear_md5_list(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; - struct hlist_node *pos, *n; + struct hlist_node *n; struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - if (!hlist_empty(&md5sig->head)) - tcp_free_md5sig_pool(); - hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) { + hlist_for_each_entry_safe(key, n, &md5sig->head, node) { hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree_rcu(key, rcu); @@ -1160,8 +1144,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, saddr = inet_sk(sk)->inet_saddr; daddr = inet_sk(sk)->inet_daddr; } else if (req) { - saddr = inet_rsk(req)->loc_addr; - daddr = inet_rsk(req)->rmt_addr; + saddr = inet_rsk(req)->ir_loc_addr; + daddr = inet_rsk(req)->ir_rmt_addr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; @@ -1272,9 +1256,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff 
*skb) { - struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - const u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); @@ -1282,7 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false; + bool want_cookie = false, fastopen; + struct flowi4 fl4; + struct tcp_fastopen_cookie foc = { .len = -1 }; + int err; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1292,7 +1277,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * limitations, they conserve resources and peer is * evidently real one. */ - if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); if (!want_cookie) goto drop; @@ -1303,8 +1289,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * clogging syn queue with openreqs with exponentially increasing * timeout. */ - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; + } req = inet_reqsk_alloc(&tcp_request_sock_ops); if (!req) @@ -1317,41 +1305,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); - - if (tmp_opt.cookie_plus > 0 && - tmp_opt.saw_tstamp && - !tp->rx_opt.cookie_out_never && - (sysctl_tcp_cookie_size > 0 || - (tp->cookie_values != NULL && - tp->cookie_values->cookie_desired > 0))) { - u8 *c; - u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; - int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - - if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) - goto drop_and_release; - - /* Secret recipe starts with IP addresses */ - *mess++ ^= (__force u32)daddr; - *mess++ ^= (__force u32)saddr; - - /* plus variable length Initiator Cookie */ - c = (u8 *)mess; - while (l-- > 0) - *c++ ^= *hash_location++; - - want_cookie = false; /* not our kind of cookie */ - tmp_ext.cookie_out_never = 0; /* false */ - tmp_ext.cookie_plus = tmp_opt.cookie_plus; - } else if (!tp->rx_opt.cookie_in_always) { - /* redundant indications, but ensure initialization. */ - tmp_ext.cookie_out_never = 1; /* true */ - tmp_ext.cookie_plus = 0; - } else { - goto drop_and_release; - } - tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? 
NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1360,24 +1314,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init(req, &tmp_opt, skb); ireq = inet_rsk(req); - ireq->loc_addr = daddr; - ireq->rmt_addr = saddr; + ireq->ir_loc_addr = daddr; + ireq->ir_rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(sk, skb); + ireq->opt = tcp_v4_save_options(skb); + ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb); + TCP_ECN_create_request(req, skb, sock_net(sk)); if (want_cookie) { isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { - struct inet_peer *peer = NULL; - struct flowi4 fl4; - /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -1390,12 +1342,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && - fl4.daddr == saddr && - (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && - (s32)(peer->tcp_ts - req->ts_recent) > - TCP_PAWS_WINDOW) { + fl4.daddr == saddr) { + if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1404,8 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst_metric(dst, RTAX_RTT))) { + !tcp_peer_is_proven(req, dst, false)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. 
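The admission logic above decides between answering with a syncookie, dropping the SYN, or queuing a request. A condensed sketch of those decisions, with sysctl and queue state passed in as plain values rather than the kernel's data structures:

#include <stdbool.h>

enum syn_verdict { SYN_ACCEPT, SYN_USE_COOKIE, SYN_DROP };

static enum syn_verdict admit_syn(int syncookies_sysctl,
                                  bool reqsk_queue_full, int reqsk_queue_len,
                                  bool accept_queue_full, int young_reqs,
                                  int max_syn_backlog, bool peer_proven)
{
        /* queue full, or syncookies forced unconditionally (== 2) */
        if (syncookies_sysctl == 2 || reqsk_queue_full)
                return syncookies_sysctl ? SYN_USE_COOKIE : SYN_DROP;

        /* accept queue full while young requests are still being retried */
        if (accept_queue_full && young_reqs > 1)
                return SYN_DROP;

        /* without syncookies, keep the last quarter of the backlog for
         * peers that have proven themselves before
         */
        if (!syncookies_sysctl &&
            max_syn_backlog - reqsk_queue_len < (max_syn_backlog >> 2) &&
            !peer_proven)
                return SYN_DROP;

        return SYN_ACCEPT;
}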
@@ -1420,16 +1367,25 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) + goto drop_and_free; + tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v4_send_synack(sk, dst, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; - if (tcp_v4_send_synack(sk, dst, req, - (struct request_values *)&tmp_ext, - skb_get_queue_mapping(skb)) || - want_cookie) - goto drop_and_free; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->listener = NULL; + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop_and_release: @@ -1437,6 +1393,7 @@ drop_and_release: drop_and_free: reqsk_free(req); drop: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; } EXPORT_SYMBOL(tcp_v4_conn_request); @@ -1467,13 +1424,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; + inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->rmt_addr; - newinet->inet_rcv_saddr = ireq->loc_addr; - newinet->inet_saddr = ireq->loc_addr; + newinet->inet_daddr = ireq->ir_rmt_addr; + newinet->inet_rcv_saddr = ireq->ir_loc_addr; + newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); ireq->opt = NULL; @@ -1494,7 +1452,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } sk_setup_caps(newsk, dst); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && @@ -1502,10 +1459,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - if (tcp_rsk(req)->snt_synack) - tcp_valid_rtt_meas(newsk, - tcp_time_stamp - tcp_rsk(req)->snt_synack); - newtp->total_retrans = req->retrans; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ @@ -1538,10 +1491,8 @@ exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - tcp_clear_xmit_timers(newsk); - tcp_cleanup_congestion_control(newsk); - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); @@ -1556,7 +1507,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, iph->saddr, iph->daddr); if (req) - return tcp_check_req(sk, skb, req, prev); + return tcp_check_req(sk, skb, req, prev, false); nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -1577,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v4_check(skb->len, iph->saddr, - iph->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - 
skb->len, IPPROTO_TCP, 0); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - - /* The socket must have it's spinlock held when we get * here. * @@ -1622,11 +1551,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { - rsk = sk; - goto reset; + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } } + tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); return 0; } @@ -1667,11 +1602,94 @@ discard: return 0; csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } EXPORT_SYMBOL(tcp_v4_do_rcv); +void tcp_v4_early_demux(struct sk_buff *skb) +{ + const struct iphdr *iph; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) + return; + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + iph->saddr, th->source, + iph->daddr, ntohs(th->dest), + skb->skb_iif); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + + if (dst) + dst = dst_check(dst, 0); + if (dst && + inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) + skb_dst_set_noref(skb, dst); + } + } +} + +/* Packet is added to VJ-style prequeue for processing in process + * context, if a reader task is waiting. Apparently, this exciting + * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) + * failed somewhere. Latency? Burstiness? Well, at least now we will + * see, why it failed. 8)8) --ANK + * + */ +bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sysctl_tcp_low_latency || !tp->ucopy.task) + return false; + + if (skb->len <= tcp_hdrlen(skb) && + skb_queue_len(&tp->ucopy.prequeue) == 0) + return false; + + skb_dst_force(skb); + __skb_queue_tail(&tp->ucopy.prequeue, skb); + tp->ucopy.memory += skb->truesize; + if (tp->ucopy.memory > sk->sk_rcvbuf) { + struct sk_buff *skb1; + + BUG_ON(sock_owned_by_user(sk)); + + while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { + sk_backlog_rcv(sk, skb1); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPPREQUEUEDROPPED); + } + + tp->ucopy.memory = 0; + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { + wake_up_interruptible_sync_poll(sk_sleep(sk), + POLLIN | POLLRDNORM | POLLRDBAND); + if (!inet_csk_ack_scheduled(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + (3 * tcp_rto_min(sk)) / 4, + TCP_RTO_MAX); + } + return true; +} +EXPORT_SYMBOL(tcp_prequeue); + /* * From tcp_input.c */ @@ -1704,8 +1722,9 @@ int tcp_v4_rcv(struct sk_buff *skb) * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. 
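The removed tcp_v4_checksum_init() and the skb_checksum_init() call in the next hunk both verify the segment against the IPv4 pseudo-header. For reference, a stand-alone ones'-complement verification over pseudo-header plus segment (addresses passed in host byte order, segment length assumed below 64 KiB; not kernel code):

#include <stdint.h>
#include <stddef.h>

static uint16_t csum_fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

static uint16_t tcp_v4_csum_check(uint32_t saddr, uint32_t daddr,
                                  const uint8_t *seg, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        /* pseudo-header: addresses, zero + protocol, TCP length */
        sum += (saddr >> 16) + (saddr & 0xffff);
        sum += (daddr >> 16) + (daddr & 0xffff);
        sum += 6;                     /* IPPROTO_TCP */
        sum += (uint32_t)len;

        /* TCP header + payload as big-endian 16-bit words */
        for (i = 0; i + 1 < len; i += 2)
                sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
        if (len & 1)
                sum += (uint32_t)seg[len - 1] << 8;

        return csum_fold(sum);        /* 0 means the checksum verifies */
}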
*/ - if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) - goto bad_packet; + + if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) + goto csum_error; th = tcp_hdr(skb); iph = ip_hdr(skb); @@ -1737,6 +1756,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); @@ -1771,6 +1791,8 @@ no_tcp_socket: goto discard_it; if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { +csum_error: + TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { @@ -1792,15 +1814,19 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + if (skb->len < (th->doff << 2)) { inet_twsk_put(inet_twsk(sk)); - goto discard_it; + goto bad_packet; + } + if (tcp_checksum_complete(skb)) { + inet_twsk_put(inet_twsk(sk)); + goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (sk2) { @@ -1821,49 +1847,29 @@ do_time_wait: goto discard_it; } -struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) -{ - struct rtable *rt = (struct rtable *) __sk_dst_get(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_peer *peer; - - if (!rt || - inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { - peer = inet_getpeer_v4(inet->inet_daddr, 1); - *release_it = true; - } else { - if (!rt->peer) - rt_bind_peer(rt, inet->inet_daddr, 1); - peer = rt->peer; - *release_it = false; - } - - return peer; -} -EXPORT_SYMBOL(tcp_v4_get_peer); - -void *tcp_v4_tw_get_peer(struct sock *sk) -{ - const struct inet_timewait_sock *tw = inet_twsk(sk); - - return inet_getpeer_v4(tw->tw_daddr, 1); -} -EXPORT_SYMBOL(tcp_v4_tw_get_peer); - static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, - .twsk_getpeer = tcp_v4_tw_get_peer, }; +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} +EXPORT_SYMBOL(inet_sk_rx_dst_set); + const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, - .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, @@ -1938,20 +1944,10 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - /* - * If sendmsg cached page exists, toss it. - */ - if (sk->sk_sndmsg_page) { - __free_page(sk->sk_sndmsg_page); - sk->sk_sndmsg_page = NULL; - } + BUG_ON(tp->fastopen_rsk != NULL); - /* TCP Cookie Transactions */ - if (tp->cookie_values != NULL) { - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - tp->cookie_values = NULL; - } + /* If socket is aborted during connect operation */ + tcp_free_fastopen_req(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); @@ -1961,18 +1957,6 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. 
*/ -static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) -{ - return hlist_nulls_empty(head) ? NULL : - list_entry(head->first, struct inet_timewait_sock, tw_node); -} - -static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) -{ - return !is_a_nulls(tw->tw_node.next) ? - hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; -} - /* * Get next listener socket follow cur. If cur is NULL, get first socket * starting from bucket given in st->bucket; when st->bucket is zero the @@ -2076,10 +2060,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } -static inline bool empty_bucket(struct tcp_iter_state *st) +static inline bool empty_bucket(const struct tcp_iter_state *st) { - return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && - hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); + return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); } /* @@ -2096,7 +2079,6 @@ static void *established_get_first(struct seq_file *seq) for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; - struct inet_timewait_sock *tw; spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); /* Lockless fast path for the common case of empty buckets */ @@ -2112,18 +2094,7 @@ static void *established_get_first(struct seq_file *seq) rc = sk; goto out; } - st->state = TCP_SEQ_STATE_TIME_WAIT; - inet_twsk_for_each(tw, node, - &tcp_hashinfo.ehash[st->bucket].twchain) { - if (tw->tw_family != st->family || - !net_eq(twsk_net(tw), net)) { - continue; - } - rc = tw; - goto out; - } spin_unlock_bh(lock); - st->state = TCP_SEQ_STATE_ESTABLISHED; } out: return rc; @@ -2132,7 +2103,6 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { struct sock *sk = cur; - struct inet_timewait_sock *tw; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); @@ -2140,45 +2110,16 @@ static void *established_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_TIME_WAIT) { - tw = cur; - tw = tw_next(tw); -get_tw: - while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { - tw = tw_next(tw); - } - if (tw) { - cur = tw; - goto out; - } - spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); - st->state = TCP_SEQ_STATE_ESTABLISHED; - - /* Look for next non empty bucket */ - st->offset = 0; - while (++st->bucket <= tcp_hashinfo.ehash_mask && - empty_bucket(st)) - ; - if (st->bucket > tcp_hashinfo.ehash_mask) - return NULL; - - spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); - sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); - } else - sk = sk_nulls_next(sk); + sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) - goto found; + return sk; } - st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); - goto get_tw; -found: - cur = sk; -out: - return cur; + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + ++st->bucket; + return established_get_first(seq); } static void *established_get_idx(struct seq_file *seq, loff_t pos) @@ -2231,10 +2172,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (rc) break; st->bucket = 0; + st->state = TCP_SEQ_STATE_ESTABLISHED; /* Fallthrough */ case TCP_SEQ_STATE_ESTABLISHED: - case TCP_SEQ_STATE_TIME_WAIT: - st->state = TCP_SEQ_STATE_ESTABLISHED; 
if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); @@ -2291,7 +2231,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } break; case TCP_SEQ_STATE_ESTABLISHED: - case TCP_SEQ_STATE_TIME_WAIT: rc = established_get_next(seq, v); break; } @@ -2315,7 +2254,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); break; - case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); @@ -2325,7 +2263,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) int tcp_seq_open(struct inode *inode, struct file *file) { - struct tcp_seq_afinfo *afinfo = PDE(inode)->data; + struct tcp_seq_afinfo *afinfo = PDE_DATA(inode); struct tcp_iter_state *s; int err; @@ -2360,50 +2298,52 @@ EXPORT_SYMBOL(tcp_proc_register); void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) { - proc_net_remove(net, afinfo->name); + remove_proc_entry(afinfo->name, net->proc_net); } EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct sock *sk, const struct request_sock *req, - struct seq_file *f, int i, int uid, int *len) + struct seq_file *f, int i, kuid_t uid) { const struct inet_request_sock *ireq = inet_rsk(req); - int ttd = req->expires - jiffies; + long delta = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, - ireq->loc_addr, + ireq->ir_loc_addr, ntohs(inet_sk(sk)->inet_sport), - ireq->rmt_addr, - ntohs(ireq->rmt_port), + ireq->ir_rmt_addr, + ntohs(ireq->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. 
*/ 1, /* timers active (only the expire timer) */ - jiffies_to_clock_t(ttd), - req->retrans, - uid, + jiffies_delta_to_clock_t(delta), + req->num_timeout, + from_kuid_munged(seq_user_ns(f), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->sk_refcnt), - req, - len); + req); } -static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) +static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) { int timer_active; unsigned long timer_expires; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); + struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS || + icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { @@ -2426,14 +2366,14 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", + "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, timer_active, - jiffies_to_clock_t(timer_expires - jiffies), + jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sk), + from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, @@ -2441,19 +2381,17 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, - len); + sk->sk_state == TCP_LISTEN ? + (fastopenq ? fastopenq->max_qlen : 0) : + (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); } static void get_timewait4_sock(const struct inet_timewait_sock *tw, - struct seq_file *f, int i, int *len) + struct seq_file *f, int i) { __be32 dest, src; __u16 destp, srcp; - int ttd = tw->tw_ttd - jiffies; - - if (ttd < 0) - ttd = 0; + s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; @@ -2461,10 +2399,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, - 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, - atomic_read(&tw->tw_refcnt), tw, len); + 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); } #define TMPSZ 150 @@ -2472,11 +2410,11 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, static int tcp4_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; - int len; + struct sock *sk = v; + seq_setwidth(seq, TMPSZ - 1); if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-*s\n", TMPSZ - 1, - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode"); goto out; @@ -2486,17 +2424,17 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_ESTABLISHED: - get_tcp4_sock(v, seq, st->num, &len); + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else + get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); - break; - case TCP_SEQ_STATE_TIME_WAIT: - get_timewait4_sock(v, seq, st->num, &len); + get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid); break; } - seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); out: + seq_pad(seq, '\n'); return 0; } @@ -2543,39 +2481,6 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - const struct iphdr *iph = skb_gro_network_header(skb); - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - - /* fall through */ - case CHECKSUM_NONE: - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } - - return tcp_gro_receive(head, skb); -} - -int tcp4_gro_complete(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - struct tcphdr *th = tcp_hdr(skb); - - th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), - iph->saddr, iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - - return tcp_gro_complete(skb); -} - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -2593,14 +2498,18 @@ struct proto tcp_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = 
sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, @@ -2614,7 +2523,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM .init_cgroup = tcp_init_cgroup, .destroy_cgroup = tcp_destroy_cgroup, .proto_cgroup = tcp_proto_cgroup, @@ -2624,13 +2533,12 @@ EXPORT_SYMBOL(tcp_prot); static int __net_init tcp_sk_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv4.tcp_sock, - PF_INET, SOCK_RAW, IPPROTO_TCP, net); + net->ipv4.sysctl_tcp_ecn = 2; + return 0; } static void __net_exit tcp_sk_exit(struct net *net) { - inet_ctl_sock_destroy(net->ipv4.tcp_sock); } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 72f7218b03f..1e70fa8fa79 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } /** @@ -314,11 +314,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) } static struct tcp_congestion_ops tcp_lp __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index b6f3583ddfe..f7a2ec3ac58 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,18 +6,6 @@ #include <linux/memcontrol.h> #include <linux/module.h> -static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) -{ - return container_of(cg_proto, struct tcp_memcontrol, cg_proto); -} - -static void memcg_tcp_enter_memory_pressure(struct sock *sk) -{ - if (sk->sk_cgrp->memory_pressure) - *sk->sk_cgrp->memory_pressure = 1; -} -EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); - int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { /* @@ -27,34 +15,24 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) */ struct res_counter *res_parent = NULL; struct cg_proto *cg_proto, *parent_cg; - struct tcp_memcontrol *tcp; struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct net *net = current->nsproxy->net_ns; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; - tcp = tcp_from_cgproto(cg_proto); - - tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; - tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; - tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; - tcp->tcp_memory_pressure = 0; + cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; + cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; + cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; + cg_proto->memory_pressure = 0; + cg_proto->memcg = memcg; parent_cg = tcp_prot.proto_cgroup(parent); if (parent_cg) - res_parent = parent_cg->memory_allocated; - - res_counter_init(&tcp->tcp_memory_allocated, res_parent); - percpu_counter_init(&tcp->tcp_sockets_allocated, 0); + res_parent = &parent_cg->memory_allocated; - cg_proto->enter_memory_pressure = 
memcg_tcp_enter_memory_pressure; - cg_proto->memory_pressure = &tcp->tcp_memory_pressure; - cg_proto->sysctl_mem = tcp->tcp_prot_mem; - cg_proto->memory_allocated = &tcp->tcp_memory_allocated; - cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; - cg_proto->memcg = memcg; + res_counter_init(&cg_proto->memory_allocated, res_parent); + percpu_counter_init(&cg_proto->sockets_allocated, 0); return 0; } @@ -63,26 +41,18 @@ EXPORT_SYMBOL(tcp_init_cgroup); void tcp_destroy_cgroup(struct mem_cgroup *memcg) { struct cg_proto *cg_proto; - struct tcp_memcontrol *tcp; - u64 val; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return; - tcp = tcp_from_cgproto(cg_proto); - percpu_counter_destroy(&tcp->tcp_sockets_allocated); - - val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); + percpu_counter_destroy(&cg_proto->sockets_allocated); } EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) { - struct net *net = current->nsproxy->net_ns; - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; - u64 old_lim; int i; int ret; @@ -90,23 +60,20 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) if (!cg_proto) return -EINVAL; - if (val > RESOURCE_MAX) - val = RESOURCE_MAX; + if (val > RES_COUNTER_MAX) + val = RES_COUNTER_MAX; - tcp = tcp_from_cgproto(cg_proto); - - old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); - ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); + ret = res_counter_set_limit(&cg_proto->memory_allocated, val); if (ret) return ret; for (i = 0; i < 3; i++) - tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, - net->ipv4.sysctl_tcp_mem[i]); + cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, + sysctl_tcp_mem[i]); - if (val == RESOURCE_MAX) + if (val == RES_COUNTER_MAX) clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else if (val != RESOURCE_MAX) { + else if (val != RES_COUNTER_MAX) { /* * The active bit needs to be written after the static_key * update. 
This is what guarantees that the socket activation @@ -135,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, - const char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned long long val; int ret = 0; - switch (cft->private) { + buf = strstrip(buf); + + switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = tcp_update_limit(memcg, val); @@ -154,43 +123,39 @@ static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) { - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return default_val; - tcp = tcp_from_cgproto(cg_proto); - return res_counter_read_u64(&tcp->tcp_memory_allocated, type); + return res_counter_read_u64(&cg_proto->memory_allocated, type); } static u64 tcp_read_usage(struct mem_cgroup *memcg) { - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; - tcp = tcp_from_cgproto(cg_proto); - return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); } -static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) +static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); u64 val; switch (cft->private) { case RES_LIMIT: - val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); + val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); break; case RES_USAGE: val = tcp_read_usage(memcg); @@ -205,61 +170,33 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_cont(cont); + memcg = mem_cgroup_from_css(of_css(of)); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; - tcp = tcp_from_cgproto(cg_proto); + return nbytes; - switch (event) { + switch (of_cft(of)->private) { case RES_MAX_USAGE: - res_counter_reset_max(&tcp->tcp_memory_allocated); + res_counter_reset_max(&cg_proto->memory_allocated); break; case RES_FAILCNT: - res_counter_reset_failcnt(&tcp->tcp_memory_allocated); + res_counter_reset_failcnt(&cg_proto->memory_allocated); break; } - return 0; -} - -unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) -{ - struct tcp_memcontrol *tcp; - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); - if (!cg_proto) - return 0; - - tcp = tcp_from_cgproto(cg_proto); - return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); -} - -void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) -{ - struct tcp_memcontrol *tcp; 
- struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return; - - tcp = tcp_from_cgproto(cg_proto); - - tcp->tcp_prot_mem[idx] = val; + return nbytes; } static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, + .write = tcp_cgroup_write, .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, @@ -271,13 +208,13 @@ static struct cftype tcp_files[] = { { .name = "kmem.tcp.failcnt", .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { .name = "kmem.tcp.max_usage_in_bytes", .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { } /* terminate */ @@ -285,7 +222,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c new file mode 100644 index 00000000000..4fe04180598 --- /dev/null +++ b/net/ipv4/tcp_metrics.c @@ -0,0 +1,1188 @@ +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/tcp.h> +#include <linux/hash.h> +#include <linux/tcp_metrics.h> +#include <linux/vmalloc.h> + +#include <net/inet_connection_sock.h> +#include <net/net_namespace.h> +#include <net/request_sock.h> +#include <net/inetpeer.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/dst.h> +#include <net/tcp.h> +#include <net/genetlink.h> + +int sysctl_tcp_nometrics_save __read_mostly; + +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, + struct net *net, unsigned int hash); + +struct tcp_fastopen_metrics { + u16 mss; + u16 syn_loss:10; /* Recurring Fast Open SYN losses */ + unsigned long last_syn_loss; /* Last Fast Open SYN loss */ + struct tcp_fastopen_cookie cookie; +}; + +/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility + * Kernel only stores RTT and RTTVAR in usec resolution + */ +#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) + +struct tcp_metrics_block { + struct tcp_metrics_block __rcu *tcpm_next; + struct inetpeer_addr tcpm_saddr; + struct inetpeer_addr tcpm_daddr; + unsigned long tcpm_stamp; + u32 tcpm_ts; + u32 tcpm_ts_stamp; + u32 tcpm_lock; + u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; + struct tcp_fastopen_metrics tcpm_fastopen; + + struct rcu_head rcu_head; +}; + +static bool tcp_metric_locked(struct tcp_metrics_block *tm, + enum tcp_metric_index idx) +{ + return tm->tcpm_lock & (1 << idx); +} + +static u32 tcp_metric_get(struct tcp_metrics_block *tm, + enum tcp_metric_index idx) +{ + return tm->tcpm_vals[idx]; +} + +static void tcp_metric_set(struct tcp_metrics_block *tm, + enum tcp_metric_index idx, + u32 val) +{ + tm->tcpm_vals[idx] = val; +} + +static bool addr_same(const struct inetpeer_addr *a, + const struct inetpeer_addr *b) +{ + const struct in6_addr *a6, *b6; + + if (a->family != b->family) + return false; + if (a->family == AF_INET) + return a->addr.a4 == b->addr.a4; + + a6 = (const struct in6_addr *) &a->addr.a6[0]; + b6 = (const struct in6_addr *) &b->addr.a6[0]; + + return ipv6_addr_equal(a6, b6); +} + +struct tcpm_hash_bucket { + struct tcp_metrics_block __rcu *chain; 
+}; + +static DEFINE_SPINLOCK(tcp_metrics_lock); + +static void tcpm_suck_dst(struct tcp_metrics_block *tm, + const struct dst_entry *dst, + bool fastopen_clear) +{ + u32 msval; + u32 val; + + tm->tcpm_stamp = jiffies; + + val = 0; + if (dst_metric_locked(dst, RTAX_RTT)) + val |= 1 << TCP_METRIC_RTT; + if (dst_metric_locked(dst, RTAX_RTTVAR)) + val |= 1 << TCP_METRIC_RTTVAR; + if (dst_metric_locked(dst, RTAX_SSTHRESH)) + val |= 1 << TCP_METRIC_SSTHRESH; + if (dst_metric_locked(dst, RTAX_CWND)) + val |= 1 << TCP_METRIC_CWND; + if (dst_metric_locked(dst, RTAX_REORDERING)) + val |= 1 << TCP_METRIC_REORDERING; + tm->tcpm_lock = val; + + msval = dst_metric_raw(dst, RTAX_RTT); + tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + + msval = dst_metric_raw(dst, RTAX_RTTVAR); + tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; + tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); + tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); + tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); + tm->tcpm_ts = 0; + tm->tcpm_ts_stamp = 0; + if (fastopen_clear) { + tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.cookie.len = 0; + } +} + +#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + tcpm_suck_dst(tm, dst, false); +} + +#define TCP_METRICS_RECLAIM_DEPTH 5 +#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL + +static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, + struct inetpeer_addr *saddr, + struct inetpeer_addr *daddr, + unsigned int hash) +{ + struct tcp_metrics_block *tm; + struct net *net; + bool reclaim = false; + + spin_lock_bh(&tcp_metrics_lock); + net = dev_net(dst->dev); + + /* While waiting for the spin-lock the cache might have been populated + * with this entry and so we have to check again. 
+ */ + tm = __tcp_get_metrics(saddr, daddr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) { + reclaim = true; + tm = NULL; + } + if (tm) { + tcpm_check_stamp(tm, dst); + goto out_unlock; + } + + if (unlikely(reclaim)) { + struct tcp_metrics_block *oldest; + + oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); + for (tm = rcu_dereference(oldest->tcpm_next); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) + oldest = tm; + } + tm = oldest; + } else { + tm = kmalloc(sizeof(*tm), GFP_ATOMIC); + if (!tm) + goto out_unlock; + } + tm->tcpm_saddr = *saddr; + tm->tcpm_daddr = *daddr; + + tcpm_suck_dst(tm, dst, true); + + if (likely(!reclaim)) { + tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; + rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); + } + +out_unlock: + spin_unlock_bh(&tcp_metrics_lock); + return tm; +} + +static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) +{ + if (tm) + return tm; + if (depth > TCP_METRICS_RECLAIM_DEPTH) + return TCP_METRICS_RECLAIM_PTR; + return NULL; +} + +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, + struct net *net, unsigned int hash) +{ + struct tcp_metrics_block *tm; + int depth = 0; + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_saddr, saddr) && + addr_same(&tm->tcpm_daddr, daddr)) + break; + depth++; + } + return tcp_get_encode(tm, depth); +} + +static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, + struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr saddr, daddr; + unsigned int hash; + struct net *net; + + saddr.family = req->rsk_ops->family; + daddr.family = req->rsk_ops->family; + switch (daddr.family) { + case AF_INET: + saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; + daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; + hash = (__force unsigned int) daddr.addr.a4; + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; + *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); + break; +#endif + default: + return NULL; + } + + net = dev_net(dst->dev); + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) + break; + } + tcpm_check_stamp(tm, dst); + return tm; +} + +static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr saddr, daddr; + unsigned int hash; + struct net *net; + + if (tw->tw_family == AF_INET) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } +#if IS_ENABLED(CONFIG_IPV6) + else if (tw->tw_family == AF_INET6) { + if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr 
*)daddr.addr.a6 = tw->tw_v6_daddr; + hash = ipv6_addr_hash(&tw->tw_v6_daddr); + } + } +#endif + else + return NULL; + + net = twsk_net(tw); + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) + break; + } + return tm; +} + +static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, + struct dst_entry *dst, + bool create) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr saddr, daddr; + unsigned int hash; + struct net *net; + + if (sk->sk_family == AF_INET) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; + hash = ipv6_addr_hash(&sk->sk_v6_daddr); + } + } +#endif + else + return NULL; + + net = dev_net(dst->dev); + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + tm = __tcp_get_metrics(&saddr, &daddr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) + tm = NULL; + if (!tm && create) + tm = tcpm_new(dst, &saddr, &daddr, hash); + else + tcpm_check_stamp(tm, dst); + + return tm; +} + +/* Save metrics learned by this TCP session. This function is called + * only, when TCP finishes successfully i.e. when it enters TIME-WAIT + * or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_metrics_block *tm; + unsigned long rtt; + u32 val; + int m; + + if (sysctl_tcp_nometrics_save || !dst) + return; + + if (dst->flags & DST_HOST) + dst_confirm(dst); + + rcu_read_lock(); + if (icsk->icsk_backoff || !tp->srtt_us) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. Reset our + * results. + */ + tm = tcp_get_metrics(sk, dst, false); + if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) + tcp_metric_set(tm, TCP_METRIC_RTT, 0); + goto out_unlock; + } else + tm = tcp_get_metrics(sk, dst, true); + + if (!tm) + goto out_unlock; + + rtt = tcp_metric_get(tm, TCP_METRIC_RTT); + m = rtt - tp->srtt_us; + + /* If newly calculated rtt larger than stored one, store new + * one. Otherwise, use EWMA. Remember, rtt overestimation is + * always better than underestimation. + */ + if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { + if (m <= 0) + rtt = tp->srtt_us; + else + rtt -= (m >> 3); + tcp_metric_set(tm, TCP_METRIC_RTT, rtt); + } + + if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { + unsigned long var; + + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev_us) + m = tp->mdev_us; + + var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); + if (m >= var) + var = m; + else + var -= (var - m) >> 2; + + tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); + } + + if (tcp_in_initial_slowstart(tp)) { + /* Slow start still did not finish. 
*/ + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + if (val && (tp->snd_cwnd >> 1) > val) + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + tp->snd_cwnd >> 1); + } + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { + val = tcp_metric_get(tm, TCP_METRIC_CWND); + if (tp->snd_cwnd > val) + tcp_metric_set(tm, TCP_METRIC_CWND, + tp->snd_cwnd); + } + } else if (tp->snd_cwnd > tp->snd_ssthresh && + icsk->icsk_ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { + val = tcp_metric_get(tm, TCP_METRIC_CWND); + tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); + } + } else { + /* Else slow start did not finish, cwnd is non-sense, + * ssthresh may be also invalid. + */ + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { + val = tcp_metric_get(tm, TCP_METRIC_CWND); + tcp_metric_set(tm, TCP_METRIC_CWND, + (val + tp->snd_ssthresh) >> 1); + } + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + if (val && tp->snd_ssthresh > val) + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + tp->snd_ssthresh); + } + if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { + val = tcp_metric_get(tm, TCP_METRIC_REORDERING); + if (val < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + tcp_metric_set(tm, TCP_METRIC_REORDERING, + tp->reordering); + } + } + tm->tcpm_stamp = jiffies; +out_unlock: + rcu_read_unlock(); +} + +/* Initialize metrics on socket. */ + +void tcp_init_metrics(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_metrics_block *tm; + u32 val, crtt = 0; /* cached RTT scaled by 8 */ + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (!tm) { + rcu_read_unlock(); + goto reset; + } + + if (tcp_metric_locked(tm, TCP_METRIC_CWND)) + tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); + + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + if (val) { + tp->snd_ssthresh = val; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } else { + /* ssthresh may have been reduced unnecessarily during. + * 3WHS. Restore it back to its initial default. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + } + val = tcp_metric_get(tm, TCP_METRIC_REORDERING); + if (val && tp->reordering != val) { + tcp_disable_fack(tp); + tcp_disable_early_retrans(tp); + tp->reordering = val; + } + + crtt = tcp_metric_get(tm, TCP_METRIC_RTT); + rcu_read_unlock(); +reset: + /* The initial RTT measurement from the SYN/SYN-ACK is not ideal + * to seed the RTO for later data packets because SYN packets are + * small. Use the per-dst cached values to seed the RTO but keep + * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). + * Later the RTO will be updated immediately upon obtaining the first + * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only + * influences the first RTO but not later RTT estimation. + * + * But if RTT is not available from the SYN (due to retransmits or + * syn cookies) or the cache, force a conservative 3secs timeout. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. 
In normal circumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ + if (crtt > tp->srtt_us) { + /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ + crtt /= 8 * USEC_PER_MSEC; + inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); + } else if (tp->srtt_us == 0) { + /* RFC6298: 5.7 We've failed to get a valid RTT sample from + * 3WHS. This is most likely due to retransmission, + * including spurious one. Reset the RTO back to 3secs + * from the more aggressive 1sec to avoid more spurious + * retransmission. + */ + tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); + tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; + + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; + } + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +{ + struct tcp_metrics_block *tm; + bool ret; + + if (!dst) + return false; + + rcu_read_lock(); + tm = __tcp_get_metrics_req(req, dst); + if (paws_check) { + if (tm && + (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && + (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ret = false; + else + ret = true; + } else { + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) + ret = true; + else + ret = false; + } + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_peer_is_proven); + +void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; + tp->rx_opt.ts_recent = tm->tcpm_ts; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); + +/* VJ's idea. Save last timestamp seen from this destination and hold + * it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter + * synchronized state. 
+ */ +bool tcp_remember_stamp(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + bool ret = false; + + if (dst) { + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && + tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; + tm->tcpm_ts = tp->rx_opt.ts_recent; + } + ret = true; + } + rcu_read_unlock(); + } + return ret; +} + +bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) +{ + struct tcp_metrics_block *tm; + bool ret = false; + + rcu_read_lock(); + tm = __tcp_get_metrics_tw(tw); + if (tm) { + const struct tcp_timewait_sock *tcptw; + struct sock *sk = (struct sock *) tw; + + tcptw = tcp_twsk(sk); + if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && + tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; + tm->tcpm_ts = tcptw->tw_ts_recent; + } + ret = true; + } + rcu_read_unlock(); + + return ret; +} + +static DEFINE_SEQLOCK(fastopen_seqlock); + +void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, + struct tcp_fastopen_cookie *cookie, + int *syn_loss, unsigned long *last_syn_loss) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); + if (tm) { + struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; + unsigned int seq; + + do { + seq = read_seqbegin(&fastopen_seqlock); + if (tfom->mss) + *mss = tfom->mss; + *cookie = tfom->cookie; + *syn_loss = tfom->syn_loss; + *last_syn_loss = *syn_loss ? 
tfom->last_syn_loss : 0; + } while (read_seqretry(&fastopen_seqlock, seq)); + } + rcu_read_unlock(); +} + +void tcp_fastopen_cache_set(struct sock *sk, u16 mss, + struct tcp_fastopen_cookie *cookie, bool syn_lost) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_metrics_block *tm; + + if (!dst) + return; + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; + + write_seqlock_bh(&fastopen_seqlock); + if (mss) + tfom->mss = mss; + if (cookie && cookie->len > 0) + tfom->cookie = *cookie; + if (syn_lost) { + ++tfom->syn_loss; + tfom->last_syn_loss = jiffies; + } else + tfom->syn_loss = 0; + write_sequnlock_bh(&fastopen_seqlock); + } + rcu_read_unlock(); +} + +static struct genl_family tcp_metrics_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = TCP_METRICS_GENL_NAME, + .version = TCP_METRICS_GENL_VERSION, + .maxattr = TCP_METRICS_ATTR_MAX, + .netnsok = true, +}; + +static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { + [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, + [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr), }, + /* Following attributes are not received for GET/DEL, + * we keep them for reference + */ +#if 0 + [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, }, + [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, }, + [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, }, + [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, }, + [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, }, + [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, }, + [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, }, + [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY, + .len = TCP_FASTOPEN_COOKIE_MAX, }, +#endif +}; + +/* Add attributes, caller cancels its header on failure */ +static int tcp_metrics_fill_info(struct sk_buff *msg, + struct tcp_metrics_block *tm) +{ + struct nlattr *nest; + int i; + + switch (tm->tcpm_daddr.family) { + case AF_INET: + if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, + tm->tcpm_daddr.addr.a4) < 0) + goto nla_put_failure; + if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, + tm->tcpm_saddr.addr.a4) < 0) + goto nla_put_failure; + break; + case AF_INET6: + if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, + tm->tcpm_daddr.addr.a6) < 0) + goto nla_put_failure; + if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, + tm->tcpm_saddr.addr.a6) < 0) + goto nla_put_failure; + break; + default: + return -EAFNOSUPPORT; + } + + if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, + jiffies - tm->tcpm_stamp) < 0) + goto nla_put_failure; + if (tm->tcpm_ts_stamp) { + if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, + (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0) + goto nla_put_failure; + if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL, + tm->tcpm_ts) < 0) + goto nla_put_failure; + } + + { + int n = 0; + + nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); + if (!nest) + goto nla_put_failure; + for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { + u32 val = tm->tcpm_vals[i]; + + if (!val) + continue; + if (i == TCP_METRIC_RTT) { + if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (i == TCP_METRIC_RTTVAR) { + if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (nla_put_u32(msg, i + 1, val) < 0) + goto nla_put_failure; + n++; + } + if (n) + nla_nest_end(msg, nest); + else + 
nla_nest_cancel(msg, nest); + } + + { + struct tcp_fastopen_metrics tfom_copy[1], *tfom; + unsigned int seq; + + do { + seq = read_seqbegin(&fastopen_seqlock); + tfom_copy[0] = tm->tcpm_fastopen; + } while (read_seqretry(&fastopen_seqlock, seq)); + + tfom = tfom_copy; + if (tfom->mss && + nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, + tfom->mss) < 0) + goto nla_put_failure; + if (tfom->syn_loss && + (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, + tfom->syn_loss) < 0 || + nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, + jiffies - tfom->last_syn_loss) < 0)) + goto nla_put_failure; + if (tfom->cookie.len > 0 && + nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, + tfom->cookie.len, tfom->cookie.val) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int tcp_metrics_dump_info(struct sk_buff *skb, + struct netlink_callback *cb, + struct tcp_metrics_block *tm) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &tcp_metrics_nl_family, NLM_F_MULTI, + TCP_METRICS_CMD_GET); + if (!hdr) + return -EMSGSIZE; + + if (tcp_metrics_fill_info(skb, tm) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int tcp_metrics_nl_dump(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + unsigned int row, s_row = cb->args[0]; + int s_col = cb->args[1], col = s_col; + + for (row = s_row; row < max_rows; row++, s_col = 0) { + struct tcp_metrics_block *tm; + struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; + + rcu_read_lock(); + for (col = 0, tm = rcu_dereference(hb->chain); tm; + tm = rcu_dereference(tm->tcpm_next), col++) { + if (col < s_col) + continue; + if (tcp_metrics_dump_info(skb, cb, tm) < 0) { + rcu_read_unlock(); + goto done; + } + } + rcu_read_unlock(); + } + +done: + cb->args[0] = row; + cb->args[1] = col; + return skb->len; +} + +static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional, int v4, int v6) +{ + struct nlattr *a; + + a = info->attrs[v4]; + if (a) { + addr->family = AF_INET; + addr->addr.a4 = nla_get_be32(a); + if (hash) + *hash = (__force unsigned int) addr->addr.a4; + return 0; + } + a = info->attrs[v6]; + if (a) { + if (nla_len(a) != sizeof(struct in6_addr)) + return -EINVAL; + addr->family = AF_INET6; + memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); + if (hash) + *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + return 0; + } + return optional ? 
1 : -EAFNOSUPPORT; +} + +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional) +{ + return __parse_nl_addr(info, addr, hash, optional, + TCP_METRICS_ATTR_ADDR_IPV4, + TCP_METRICS_ATTR_ADDR_IPV6); +} + +static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) +{ + return __parse_nl_addr(info, addr, NULL, 0, + TCP_METRICS_ATTR_SADDR_IPV4, + TCP_METRICS_ATTR_SADDR_IPV6); +} + +static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr saddr, daddr; + unsigned int hash; + struct sk_buff *msg; + struct net *net = genl_info_net(info); + void *reply; + int ret; + bool src = true; + + ret = parse_nl_addr(info, &daddr, &hash, 0); + if (ret < 0) + return ret; + + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, + info->genlhdr->cmd); + if (!reply) + goto nla_put_failure; + + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + ret = -ESRCH; + rcu_read_lock(); + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + ret = tcp_metrics_fill_info(msg, tm); + break; + } + } + rcu_read_unlock(); + if (ret < 0) + goto out_free; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +nla_put_failure: + ret = -EMSGSIZE; + +out_free: + nlmsg_free(msg); + return ret; +} + +#define deref_locked_genl(p) \ + rcu_dereference_protected(p, lockdep_genl_is_held() && \ + lockdep_is_held(&tcp_metrics_lock)) + +#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) + +static int tcp_metrics_flush_all(struct net *net) +{ + unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; + struct tcp_metrics_block *tm; + unsigned int row; + + for (row = 0; row < max_rows; row++, hb++) { + spin_lock_bh(&tcp_metrics_lock); + tm = deref_locked_genl(hb->chain); + if (tm) + hb->chain = NULL; + spin_unlock_bh(&tcp_metrics_lock); + while (tm) { + struct tcp_metrics_block *next; + + next = deref_genl(tm->tcpm_next); + kfree_rcu(tm, rcu_head); + tm = next; + } + } + return 0; +} + +static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct tcpm_hash_bucket *hb; + struct tcp_metrics_block *tm; + struct tcp_metrics_block __rcu **pp; + struct inetpeer_addr saddr, daddr; + unsigned int hash; + struct net *net = genl_info_net(info); + int ret; + bool src = true, found = false; + + ret = parse_nl_addr(info, &daddr, &hash, 1); + if (ret < 0) + return ret; + if (ret > 0) + return tcp_metrics_flush_all(net); + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; + + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hb = net->ipv4.tcp_metrics_hash + hash; + pp = &hb->chain; + spin_lock_bh(&tcp_metrics_lock); + for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + *pp = tm->tcpm_next; + kfree_rcu(tm, rcu_head); + found = true; + } else { + pp = &tm->tcpm_next; + } + } + spin_unlock_bh(&tcp_metrics_lock); + if (!found) + return -ESRCH; + return 0; +} + +static const struct genl_ops tcp_metrics_nl_ops[] = { + { + .cmd = 
TCP_METRICS_CMD_GET, + .doit = tcp_metrics_nl_cmd_get, + .dumpit = tcp_metrics_nl_dump, + .policy = tcp_metrics_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = TCP_METRICS_CMD_DEL, + .doit = tcp_metrics_nl_cmd_del, + .policy = tcp_metrics_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static unsigned int tcpmhash_entries; +static int __init set_tcpmhash_entries(char *str) +{ + ssize_t ret; + + if (!str) + return 0; + + ret = kstrtouint(str, 0, &tcpmhash_entries); + if (ret) + return 0; + + return 1; +} +__setup("tcpmhash_entries=", set_tcpmhash_entries); + +static int __net_init tcp_net_metrics_init(struct net *net) +{ + size_t size; + unsigned int slots; + + slots = tcpmhash_entries; + if (!slots) { + if (totalram_pages >= 128 * 1024) + slots = 16 * 1024; + else + slots = 8 * 1024; + } + + net->ipv4.tcp_metrics_hash_log = order_base_2(slots); + size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; + + net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!net->ipv4.tcp_metrics_hash) + net->ipv4.tcp_metrics_hash = vzalloc(size); + + if (!net->ipv4.tcp_metrics_hash) + return -ENOMEM; + + return 0; +} + +static void __net_exit tcp_net_metrics_exit(struct net *net) +{ + unsigned int i; + + for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { + struct tcp_metrics_block *tm, *next; + + tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); + while (tm) { + next = rcu_dereference_protected(tm->tcpm_next, 1); + kfree(tm); + tm = next; + } + } + kvfree(net->ipv4.tcp_metrics_hash); +} + +static __net_initdata struct pernet_operations tcp_net_metrics_ops = { + .init = tcp_net_metrics_init, + .exit = tcp_net_metrics_exit, +}; + +void __init tcp_metrics_init(void) +{ + int ret; + + ret = register_pernet_subsys(&tcp_net_metrics_ops); + if (ret < 0) + goto cleanup; + ret = genl_register_family_with_ops(&tcp_metrics_nl_family, + tcp_metrics_nl_ops); + if (ret < 0) + goto cleanup_subsys; + return; + +cleanup_subsys: + unregister_pernet_subsys(&tcp_net_metrics_ops); + +cleanup: + return; +} diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b85d9fe7d66..e68e0d4af6c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -49,56 +49,6 @@ struct inet_timewait_death_row tcp_death_row = { }; EXPORT_SYMBOL_GPL(tcp_death_row); -/* VJ's idea. Save last timestamp seen from this destination - * and hold it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter synchronized - * state. 
- */ - -static bool tcp_remember_stamp(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct inet_peer *peer; - bool release_it; - - peer = icsk->icsk_af_ops->get_peer(sk, &release_it); - if (peer) { - if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; - peer->tcp_ts = tp->rx_opt.ts_recent; - } - if (release_it) - inet_putpeer(peer); - return true; - } - - return false; -} - -static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) -{ - struct sock *sk = (struct sock *) tw; - struct inet_peer *peer; - - peer = twsk_getpeer(sk); - if (peer) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - - if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; - peer->tcp_ts = tcptw->tw_ts_recent; - } - inet_putpeer(peer); - return true; - } - return false; -} - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -135,21 +85,23 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK + * + * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_options_received tmp_opt; - const u8 *hash_location; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); @@ -327,25 +279,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + struct inet_sock *inet = inet_sk(sk); - tw->tw_transparent = inet_sk(sk)->transparent; + tw->tw_transparent = inet->transparent; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + tcptw->tw_ts_offset = tp->tsoffset; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_timewait_sock *tw6; - tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); - tw6 = inet6_twsk((struct sock *)tw); - tw6->tw_v6_daddr = np->daddr; - tw6->tw_v6_rcv_saddr = np->rcv_saddr; + tw->tw_v6_daddr = sk->sk_v6_daddr; + tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; + tw->tw_flowlabel = np->flow_label >> 12; tw->tw_ipv6only = np->ipv6only; } #endif @@ -363,7 +315,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) key = 
tp->af_specific->md5_lookup(sk, sk); if (key != NULL) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL) + if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) BUG(); } } while (0); @@ -403,14 +355,44 @@ void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) { - tcp_free_md5sig_pool(); + + if (twsk->tw_md5_key) kfree_rcu(twsk->tw_md5_key, rcu); - } #endif } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_openreq_init_rwin(struct request_sock *req, + struct sock *sk, struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + int mss = dst_metric_advmss(dst); + + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + ireq->rcv_wscale = rcv_wscale; +} +EXPORT_SYMBOL(tcp_openreq_init_rwin); + static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, struct request_sock *req) { @@ -432,32 +414,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - - /* TCP Cookie Transactions require space for the cookie pair, - * as it differs for each connection. There is no need to - * copy any s_data_payload stored at the original socket. - * Failure will prevent resuming the connection. 
- * - * Presumed copied, in order of appearance: - * cookie_in_always, cookie_out_never - */ - if (oldcvp != NULL) { - struct tcp_cookie_values *newcvp = - kzalloc(sizeof(*newtp->cookie_values), - GFP_ATOMIC); - - if (newcvp != NULL) { - kref_init(&newcvp->kref); - newcvp->cookie_desired = - oldcvp->cookie_desired; - newtp->cookie_values = newcvp; - } else { - /* Not Yet Implemented */ - newtp->cookie_values = NULL; - } - } /* Now setup tcp_sock */ newtp->pred_flags = 0; @@ -466,15 +422,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_nxt = treq->rcv_isn + 1; newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = - treq->snt_isn + 1 + tcp_s_data_size(oldtp); + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; tcp_prequeue_init(newtp); + INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); - newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -483,6 +439,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); + newtp->tlp_high_seq = 0; + newtp->lsndtime = treq->snt_synack; + newtp->total_retrans = req->num_retrans; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -491,10 +450,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - newtp->bytes_acked = 0; - - newtp->frto_counter = 0; - newtp->frto_highmark = 0; if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && !try_module_get(newicsk->icsk_ca_ops->owner)) @@ -502,9 +457,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); - skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = newtp->pushed_seq = - treq->snt_isn + 1 + tcp_s_data_size(oldtp); + __skb_queue_head_init(&newtp->out_of_order_queue); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -545,6 +499,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + newtp->tsoffset = 0; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (newtp->af_specific->md5_lookup(sk, newsk)) @@ -554,6 +509,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -562,24 +519,32 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, EXPORT_SYMBOL(tcp_create_openreq_child); /* - * Process an incoming packet for SYN_RECV sockets represented - * as a request_sock. + * Process an incoming packet for SYN_RECV sockets represented as a + * request_sock. Normally sk is the listener socket but for TFO it + * points to the child socket. + * + * XXX (TFO) - The current impl contains a special check for ack + * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? 
+ * + * We don't need to initialize tmp_opt.sack_ok as we don't use the results */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev) + struct request_sock **prev, + bool fastopen) { struct tcp_options_received tmp_opt; - const u8 *hash_location; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; + BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -587,7 +552,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -612,8 +577,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. + * + * Note that even if there is new data in the SYN packet + * they will be thrown away too. + * + * Reset timer after retransmitting SYNACK, similar to + * the idea of fast retransmit in recovery. */ - req->rsk_ops->rtx_syn_ack(sk, req, NULL); + if (!inet_rtx_syn_ack(sk, req)) + req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, + TCP_RTO_MAX) + jiffies; return NULL; } @@ -669,11 +642,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * - * Invalid ACK: reset will be sent by listening socket + * Invalid ACK: reset will be sent by listening socket. + * Note that the ACK validity check for a Fast Open socket is done + * elsewhere and is checked directly against the child socket rather + * than req because user data may have been sent out. */ - if ((flg & TCP_FLAG_ACK) && + if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != - tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) + tcp_rsk(req)->snt_isn + 1)) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which @@ -684,7 +660,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -695,7 +671,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) + if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) req->ts_recent = tmp_opt.rcv_tsval; if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { @@ -714,21 +690,26 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. + * + * XXX (TFO) - if we ever allow "data after SYN", the + * following check needs to be removed. 
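Aside (not part of the patch): the sequence test in the hunk above now checks the segment against rcv_nxt .. rcv_nxt + rcv_wnd, and it only works because sequence comparisons are done with wraparound-safe serial arithmetic. A small standalone sketch of that idea; these are not the kernel's exact helpers and the boundary handling is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Serial-number arithmetic: s1 is "before" s2 if the signed 32-bit
 * difference is negative, which stays correct across the 2^32 wrap.
 */
static bool seq_before(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) < 0;
}

/* Does [seq, end_seq] overlap the window [win_lo, win_hi]?  Mirrors the
 * shape of the in-window test applied to SYN_RECV segments.
 */
static bool in_window(uint32_t seq, uint32_t end_seq,
		      uint32_t win_lo, uint32_t win_hi)
{
	return !seq_before(end_seq, win_lo) && !seq_before(win_hi, seq);
}

int main(void)
{
	uint32_t rcv_nxt = 0xFFFFFFF0u;	/* close to the wrap point */
	uint32_t rcv_wnd = 0x10000;

	/* A segment straddling the 2^32 wrap is still in window. */
	printf("%d\n", in_window(0xFFFFFFFAu, 0x00000010u,
				 rcv_nxt, rcv_nxt + rcv_wnd));
	/* A stale segment well before rcv_nxt is rejected. */
	printf("%d\n", in_window(0xFFFF0000u, 0xFFFF0010u,
				 rcv_nxt, rcv_nxt + rcv_wnd));
	return 0;
}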
*/ if (!(flg & TCP_FLAG_ACK)) return NULL; + /* For Fast Open no more processing is needed (sk is the + * child socket). + */ + if (fastopen) + return sk; + /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ - if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } - if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) - tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; - else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ - tcp_rsk(req)->snt_synack = 0; /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all @@ -753,11 +734,21 @@ listen_overflow: } embryonic_reset: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST)) { + /* Received a bad SYN pkt - for TFO We try not to reset + * the local connection unless it's really necessary to + * avoid becoming vulnerable to outside attack aiming at + * resetting legit local connections. + */ req->rsk_ops->send_reset(sk, skb); - - inet_csk_reqsk_queue_drop(sk, req, prev); + } else if (fastopen) { /* received a valid RST pkt */ + reqsk_fastopen_remove(sk, req, true); + tcp_reset(sk); + } + if (!fastopen) { + inet_csk_reqsk_queue_drop(sk, req, prev); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + } return NULL; } EXPORT_SYMBOL(tcp_check_req); @@ -766,6 +757,12 @@ EXPORT_SYMBOL(tcp_check_req); * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. + * + * For the vast majority of cases child->sk_state will be TCP_SYN_RECV + * when entering. But other states are possible due to a race condition + * where after __inet_lookup_established() fails but before the listener + * locked is obtained, other packets cause the same connection to + * be created. */ int tcp_child_process(struct sock *parent, struct sock *child, @@ -779,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c new file mode 100644 index 00000000000..55046ecd083 --- /dev/null +++ b/net/ipv4/tcp_offload.c @@ -0,0 +1,329 @@ +/* + * IPV4 GSO/GRO offload support + * Linux INET implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * TCPv4 GSO/GRO support + */ + +#include <linux/skbuff.h> +#include <net/tcp.h> +#include <net/protocol.h> + +struct sk_buff *tcp_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int sum_truesize = 0; + struct tcphdr *th; + unsigned int thlen; + unsigned int seq; + __be32 delta; + unsigned int oldlen; + unsigned int mss; + struct sk_buff *gso_skb = skb; + __sum16 newcheck; + bool ooo_okay, copy_destructor; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = tcp_hdr(skb); + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = (u16)~skb->len; + __skb_pull(skb, thlen); + + mss = tcp_skb_mss(skb); + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP | + SKB_GSO_SIT | + SKB_GSO_MPLS | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + 0) || + !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + copy_destructor = gso_skb->destructor == tcp_wfree; + ooo_okay = gso_skb->ooo_okay; + /* All segments but the first should have ooo_okay cleared */ + skb->ooo_okay = 0; + + segs = skb_segment(skb, features); + if (IS_ERR(segs)) + goto out; + + /* Only first segment might have ooo_okay set */ + segs->ooo_okay = ooo_okay; + + delta = htonl(oldlen + (thlen + mss)); + + skb = segs; + th = tcp_hdr(skb); + seq = ntohl(th->seq); + + newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + + do { + th->fin = th->psh = 0; + th->check = newcheck; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + th->check = gso_make_checksum(skb, ~th->check); + + seq += mss; + if (copy_destructor) { + skb->destructor = gso_skb->destructor; + skb->sk = gso_skb->sk; + sum_truesize += skb->truesize; + } + skb = skb->next; + th = tcp_hdr(skb); + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + /* Following permits TCP Small Queues to work well with GSO : + * The callback to TCP stack will be called at the time last frag + * is freed at TX completion, and not right now when gso_skb + * is freed by GSO engine + */ + if (copy_destructor) { + swap(gso_skb->sk, skb->sk); + swap(gso_skb->destructor, skb->destructor); + sum_truesize += skb->truesize; + atomic_add(sum_truesize - gso_skb->truesize, + &skb->sk->sk_wmem_alloc); + } + + delta = htonl(oldlen + (skb_tail_pointer(skb) - + skb_transport_header(skb)) + + skb->data_len); + th->check = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + if (skb->ip_summed != CHECKSUM_PARTIAL) + th->check = gso_make_checksum(skb, ~th->check); +out: + return segs; +} + +struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct tcphdr *th; + struct tcphdr *th2; + unsigned int len; + unsigned int thlen; + __be32 flags; + unsigned int mss = 1; + unsigned int hlen; + unsigned int off; + int flush = 1; + int i; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if 
(unlikely(!th)) + goto out; + } + + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + hlen = off + thlen; + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } + + skb_gro_pull(skb, thlen); + + len = skb_gro_len(skb); + flags = tcp_flag_word(th); + + for (; (p = *head); head = &p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + goto found; + } + + goto out_check_final; + +found: + /* Include the IP ID check below from the inner most IP hdr */ + flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; + flush |= (__force int)(flags & TCP_FLAG_CWR); + flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + flush |= (__force int)(th->ack_seq ^ th2->ack_seq); + for (i = sizeof(*th); i < thlen; i += 4) + flush |= *(u32 *)((u8 *)th + i) ^ + *(u32 *)((u8 *)th2 + i); + + mss = tcp_skb_mss(p); + + flush |= (len - 1) >= mss; + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); + + if (flush || skb_gro_receive(head, skb)) { + mss = 1; + goto out_check_final; + } + + p = *head; + th2 = tcp_hdr(p); + tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); + +out_check_final: + flush = len < mss; + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | + TCP_FLAG_RST | TCP_FLAG_SYN | + TCP_FLAG_FIN)); + + if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) + pp = head; + +out: + NAPI_GRO_CB(skb)->flush |= (flush != 0); + + return pp; +} + +int tcp_gro_complete(struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + + skb->csum_start = (unsigned char *)th - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (th->cwr) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + + return 0; +} +EXPORT_SYMBOL(tcp_gro_complete); + +static int tcp_v4_gso_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); + return 0; +} + +static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + /* Use the IP hdr immediately proceeding for this transport */ + const struct iphdr *iph = skb_gro_network_header(skb); + __wsum wsum; + + /* Don't bother verifying checksum if we're going to flush anyway. 
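Aside (not part of the patch): tcp_gso_segment() above never recomputes the whole TCP checksum per segment; it patches th->check with a 16-bit one's-complement delta for the changed length/sequence contribution, and the GRO receive path only folds a full sum when it has to. A standalone sketch of that incremental update; the header words and helper names here are made up for illustration (RFC 1624 style), not kernel API.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit one's-complement accumulator down to 16 bits. */
static uint16_t csum_fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Naive one's-complement sum over 16-bit words, for the "from scratch" check. */
static uint16_t csum16(const uint16_t *p, size_t n)
{
	uint32_t sum = 0;

	while (n--)
		sum += *p++;
	return csum_fold32(sum);
}

/* Incremental update: when one 16-bit field changes from 'old' to 'new_',
 * the complemented checksum can be patched without touching the rest.
 */
static uint16_t csum_update16(uint16_t check, uint16_t old, uint16_t new_)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old;
	sum += new_;
	return csum_fold32(sum);
}

int main(void)
{
	/* Pretend pseudo-header words; only the length word changes when a
	 * large GSO skb is cut into MSS-sized segments.
	 */
	uint16_t words[] = { 0xc0a8, 0x0001, 0xc0a8, 0x0002, 0x0006, 0x2328 };
	uint16_t check = csum16(words, 6);
	uint16_t old_len = 0x2328;	/* 9000-byte superpacket */
	uint16_t new_len = 0x05dc;	/* 1500-byte segment     */

	words[5] = new_len;
	printf("recomputed  : 0x%04x\n", csum16(words, 6));
	printf("incremental : 0x%04x\n", csum_update16(check, old_len, new_len));
	return 0;
}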
*/ + if (NAPI_GRO_CB(skb)->flush) + goto skip_csum; + + wsum = NAPI_GRO_CB(skb)->csum; + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), + 0); + + /* fall through */ + + case CHECKSUM_COMPLETE: + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, + wsum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + +skip_csum: + return tcp_gro_receive(head, skb); +} + +static int tcp4_gro_complete(struct sk_buff *skb, int thoff) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, + iph->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + + return tcp_gro_complete(skb); +} + +static const struct net_offload tcpv4_offload = { + .callbacks = { + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_gso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + }, +}; + +int __init tcpv4_offload_init(void) +{ + return inet_add_offload(&tcpv4_offload, IPPROTO_TCP); +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 803cbfe82fb..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -62,26 +65,30 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ -EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - /* Don't override Nagle indefinitely with F-RTO */ - if (tp->frto_counter == 2) - tp->frto_counter = 3; - tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || tp->early_retrans_delayed) + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { tcp_rearm_rto(sk); + } + + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, + tcp_skb_pcount(skb)); } /* SND.NXT, if window was not shrunk. @@ -159,6 +166,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -169,8 +177,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. 
*/ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && + (!dst || !dst_metric(dst, RTAX_QUICKACK))) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ @@ -180,6 +189,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } + +u32 tcp_default_init_rwnd(u32 mss) +{ + /* Initial receive window should be twice of TCP_INIT_CWND to + * enable proper sending of new unsent data during fast recovery + * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a + * limit when mss is larger than 1460. + */ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > 1460) + init_rwnd = max((1460 * init_rwnd) / mss, 2U); + return init_rwnd; +} + /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -229,22 +253,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to a value enough for senders starting with - * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place - * a limit on the initial window when mss is larger than 1460. - */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = TCP_DEFAULT_INIT_RCVWND; - if (mss > 1460) - init_cwnd = - max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - /* when initializing use the value from init_rcv_wnd - * rather than the default from above - */ - if (init_rcv_wnd) - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - else - *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); + if (!init_rcv_wnd) /* Use default unless specified otherwise */ + init_rcv_wnd = tcp_default_init_rwnd(mss); + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); } /* Set the clamp no higher than max representable value */ @@ -260,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); @@ -272,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; @@ -289,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk) new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. 
*/ - if (new_win == 0) + if (new_win == 0) { tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } return new_win; } @@ -309,7 +331,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } @@ -354,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + shinfo->gso_segs = 1; + shinfo->gso_size = 0; + shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -379,54 +403,25 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) -#define OPTION_COOKIE_EXTENSION (1 << 4) +#define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { - u8 options; /* bit field of OPTION_* */ + u16 options; /* bit field of OPTION_* */ + u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ - u16 mss; /* 0 to disable */ - __u32 tsval, tsecr; /* need to include OPTION_TS */ __u8 *hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; -/* The sysctl int routines are generic, so check consistency here. - */ -static u8 tcp_cookie_size_check(u8 desired) -{ - int cookie_size; - - if (desired > 0) - /* previously specified */ - return desired; - - cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); - if (cookie_size <= 0) - /* no default specified */ - return 0; - - if (cookie_size <= TCP_COOKIE_MIN) - /* value too small, specify minimum */ - return TCP_COOKIE_MIN; - - if (cookie_size >= TCP_COOKIE_MAX) - /* value too large, specify maximum */ - return TCP_COOKIE_MAX; - - if (cookie_size & 1) - /* 8-bit multiple, illegal, fix it */ - cookie_size++; - - return (u8)cookie_size; -} - /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). 
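Aside (not part of the patch): tcp_default_init_rwnd() in the hunk above replaces the old TCP_DEFAULT_INIT_RCVWND logic: start from twice TCP_INIT_CWND and scale the segment count down for MSS above 1460 so the advertised byte count stays roughly constant. A trivial standalone check of that arithmetic, with the constants copied from the hunk:

#include <stdio.h>

#define TCP_INIT_CWND 10

/* Mirror of tcp_default_init_rwnd(): 20 segments for MSS up to 1460,
 * otherwise the count that keeps about 20*1460 bytes, never below 2.
 */
static unsigned int default_init_rwnd(unsigned int mss)
{
	unsigned int init_rwnd = TCP_INIT_CWND * 2;

	if (mss > 1460)
		init_rwnd = (1460 * init_rwnd) / mss;
	if (init_rwnd < 2)
		init_rwnd = 2;
	return init_rwnd;
}

int main(void)
{
	unsigned int mss[] = { 536, 1460, 4096, 9000, 65160 };

	for (unsigned int i = 0; i < sizeof(mss) / sizeof(mss[0]); i++)
		printf("mss %5u -> init rwnd %2u segments (%u bytes)\n",
		       mss[i], default_init_rwnd(mss[i]),
		       default_init_rwnd(mss[i]) * mss[i]);
	return 0;
}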
@@ -437,29 +432,11 @@ static u8 tcp_cookie_size_check(u8 desired) static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { - u8 options = opts->options; /* mungable copy */ + u16 options = opts->options; /* mungable copy */ - /* Having both authentication and cookies for security is redundant, - * and there's certainly not enough room. Instead, the cookie-less - * extension variant is proposed. - * - * Consider the pessimal case with authentication. The options - * could look like: - * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 - */ if (unlikely(OPTION_MD5 & options)) { - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - *ptr++ = htonl((TCPOPT_COOKIE << 24) | - (TCPOLEN_COOKIE_BASE << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } else { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } - options &= ~OPTION_COOKIE_EXTENSION; + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; @@ -488,44 +465,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - /* Specification requires after timestamp, so do it now. - * - * Consider the pessimal case without authentication. The options - * could look like: - * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 - */ - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - __u8 *cookie_copy = opts->hash_location; - u8 cookie_size = opts->hash_size; - - /* 8-bit multiple handled in tcp_cookie_size_check() above, - * and elsewhere. - */ - if (0x2 & cookie_size) { - __u8 *p = (__u8 *)ptr; - - /* 16-bit multiple */ - *p++ = TCPOPT_COOKIE; - *p++ = TCPOLEN_COOKIE_BASE + cookie_size; - *p++ = *cookie_copy++; - *p++ = *cookie_copy++; - ptr++; - cookie_size -= 2; - } else { - /* 32-bit multiple */ - *ptr++ = htonl(((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_COOKIE << 8) | - TCPOLEN_COOKIE_BASE) + - cookie_size); - } - - if (cookie_size > 0) { - memcpy(ptr, cookie_copy, cookie_size); - ptr += (cookie_size / 4); - } - } - if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -559,6 +498,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, tp->rx_opt.dsack = 0; } + + if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { + struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + + *ptr++ = htonl((TCPOPT_EXP << 24) | + ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | + TCPOPT_FASTOPEN_MAGIC); + + memcpy(ptr, foc->val, foc->len); + if ((foc->len & 3) == 2) { + u8 *align = ((u8 *)ptr) + foc->len; + align[0] = align[1] = TCPOPT_NOP; + } + ptr += (foc->len + 3) >> 2; + } } /* Compute TCP options for SYN packets. This is not the final @@ -569,11 +523,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 
- tcp_cookie_size_check(cvp->cookie_desired) : - 0; + struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -599,7 +550,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -614,52 +565,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, remaining -= TCPOLEN_SACKPERM_ALIGNED; } - /* Note that timestamps are required by the specification. - * - * Odd numbers of bytes are prohibited by the specification, ensuring - * that the cookie is 16-bit aligned, and the resulting cookie pair is - * 32-bit aligned. - */ - if (*md5 == NULL && - (OPTION_TS & opts->options) && - cookie_size > 0) { - int need = TCPOLEN_COOKIE_BASE + cookie_size; - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - - if (need > remaining) { - /* try shrinking cookie to fit */ - cookie_size -= 2; - need -= 4; - } - } - while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { - cookie_size -= 4; - need -= 4; - } - if (TCP_COOKIE_MIN <= cookie_size) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; - opts->hash_size = cookie_size; - - /* Remember for future incarnations. */ - cvp->cookie_desired = cookie_size; - - if (cvp->cookie_desired != cvp->cookie_pair_size) { - /* Currently use random bytes as a nonce, - * assuming these are completely unpredictable - * by hostile users of the same system. - */ - get_random_bytes(&cvp->cookie_pair[0], - cookie_size); - cvp->cookie_pair_size = cookie_size; - } - + if (fastopen && fastopen->cookie.len >= 0) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = &fastopen->cookie; remaining -= need; + tp->syn_fastopen = 1; } } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -669,13 +585,10 @@ static unsigned int tcp_synack_options(struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5, - struct tcp_extend_values *xvp) + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? - xvp->cookie_plus : - 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); @@ -714,29 +627,16 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - - /* Similar rationale to tcp_syn_options() applies here, too. - * If the <SYN> options fit, the same options should fit now! 
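Aside (not part of the patch): both tcp_syn_options() and the option writer above round the experimental Fast Open option up to a 32-bit boundary and only emit it if it still fits in the 40-byte TCP option space. A small standalone sketch of that accounting; the aligned option sizes follow the usual SYN layout and are listed here only for illustration.

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE		40
#define TCPOLEN_MSS_ALIGNED		4
#define TCPOLEN_TSTAMP_ALIGNED		12
#define TCPOLEN_WSCALE_ALIGNED		4
#define TCPOLEN_SACKPERM_ALIGNED	4
#define TCPOLEN_EXP_FASTOPEN_BASE	4	/* kind, len, 2-byte magic */

int main(void)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	unsigned int cookie_len = 8;		/* a typical TFO cookie */

	/* Options a SYN typically carries before the cookie is considered. */
	remaining -= TCPOLEN_MSS_ALIGNED;
	remaining -= TCPOLEN_TSTAMP_ALIGNED;
	remaining -= TCPOLEN_WSCALE_ALIGNED;
	remaining -= TCPOLEN_SACKPERM_ALIGNED;

	/* Round the experimental option up to a 32-bit multiple, as the
	 * (need + 3) & ~3U in tcp_syn_options() does.
	 */
	unsigned int need = (TCPOLEN_EXP_FASTOPEN_BASE + cookie_len + 3) & ~3u;

	printf("room left before cookie: %u bytes\n", remaining);
	printf("cookie option needs    : %u bytes -> %s\n", need,
	       remaining >= need ? "sent" : "dropped from this SYN");
	return 0;
}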
- */ - if (*md5 == NULL && - ireq->tstamp_ok && - cookie_plus > TCPOLEN_COOKIE_BASE) { - int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - } - if (need <= remaining) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; + if (foc != NULL && foc->len >= 0) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = foc; remaining -= need; - } else { - /* There's no error return, so flag it. */ - xvp->cookie_out_never = 1; /* true */ - opts->hash_size = 0; } } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -752,6 +652,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb unsigned int size = 0; unsigned int eff_sacks; + opts->options = 0; + #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (unlikely(*md5)) { @@ -764,7 +666,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when : 0; + opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -783,6 +685,172 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb return size; } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { + struct tasklet_struct tasklet; + struct list_head head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +static void tcp_tsq_handler(struct sock *sk) +{ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | + TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); +} +/* + * One tasklet per cpu tries to send more skbs. 
+ * We run in tasklet context but need to disable irqs when + * transferring tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + LIST_HEAD(list); + unsigned long flags; + struct list_head *q, *n; + struct tcp_sock *tp; + struct sock *sk; + + local_irq_save(flags); + list_splice_init(&tsq->head, &list); + local_irq_restore(flags); + + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + tcp_tsq_handler(sk); + } else { + /* defer the work to tcp_release_cb() */ + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + } + bh_unlock_sock(sk); + + clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } +} + +#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ + (1UL << TCP_WRITE_TIMER_DEFERRED) | \ + (1UL << TCP_DELACK_TIMER_DEFERRED) | \ + (1UL << TCP_MTU_REDUCED_DEFERRED)) +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nflags; + + /* perform an atomic operation only if at least one flag is set */ + do { + flags = tp->tsq_flags; + if (!(flags & TCP_DEFERRED_ALL)) + return; + nflags = flags & ~TCP_DEFERRED_ALL; + } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + + if (flags & (1UL << TCP_TSQ_DEFERRED)) + tcp_tsq_handler(sk); + + /* Here begins the tricky part : + * We are called from release_sock() with : + * 1) BH disabled + * 2) sk_lock.slock spinlock held + * 3) socket owned by us (sk->sk_lock.owned == 1) + * + * But following code is meant to be called from BH handlers, + * so we should keep BH disabled, but early release socket ownership + */ + sock_release_ownership(sk); + + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { + tcp_write_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { + tcp_delack_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + sk->sk_prot->mtu_reduced(sk); + __sock_put(sk); + } +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + + INIT_LIST_HEAD(&tsq->head); + tasklet_init(&tsq->tasklet, + tcp_tasklet_func, + (unsigned long)tsq); + } +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We can't xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { + unsigned long flags; + struct tsq_tasklet *tsq; + + /* Keep a ref on socket. + * This last ref will be released in tcp_tasklet_func() + */ + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = &__get_cpu_var(tsq_tasklet); + list_add(&tp->tsq_node, &tsq->head); + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + } else { + sock_wfree(skb); + } +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). 
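Aside (not part of the patch): tcp_release_cb() above collects all deferred-work bits with a single cmpxchg loop, so a flag set concurrently either lands before the swap (handled now) or after it (handled on the next release), never in a lost window. A userspace sketch of the same pattern using C11 atomics; the flag names mirror the hunk and the worker bodies are stubs.

#include <stdatomic.h>
#include <stdio.h>

#define TSQ_DEFERRED		(1UL << 0)
#define WRITE_TIMER_DEFERRED	(1UL << 1)
#define DELACK_TIMER_DEFERRED	(1UL << 2)
#define MTU_REDUCED_DEFERRED	(1UL << 3)
#define DEFERRED_ALL		(TSQ_DEFERRED | WRITE_TIMER_DEFERRED | \
				 DELACK_TIMER_DEFERRED | MTU_REDUCED_DEFERRED)

static _Atomic unsigned long tsq_flags;

/* Atomically grab and clear every deferred bit in one shot. */
static unsigned long claim_deferred(void)
{
	unsigned long flags = atomic_load(&tsq_flags);
	unsigned long nflags;

	do {
		if (!(flags & DEFERRED_ALL))
			return 0;
		nflags = flags & ~DEFERRED_ALL;
	} while (!atomic_compare_exchange_weak(&tsq_flags, &flags, nflags));

	return flags & DEFERRED_ALL;
}

int main(void)
{
	atomic_fetch_or(&tsq_flags, TSQ_DEFERRED | DELACK_TIMER_DEFERRED);

	unsigned long todo = claim_deferred();

	if (todo & TSQ_DEFERRED)
		printf("flush pending transmits\n");
	if (todo & DELACK_TIMER_DEFERRED)
		printf("run delayed-ACK handler\n");
	printf("flags now: %#lx\n", atomic_load(&tsq_flags));
	return 0;
}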
This is used by both the initial * transmission and possible later retransmissions. @@ -809,19 +877,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. - */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); + if (clone_it) { + skb_mstamp_get(&skb->skb_mstamp); - if (likely(clone_it)) { if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -836,15 +902,21 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) { + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - skb->ooo_okay = 1; - } else - skb->ooo_okay = 0; + + /* if no packet is in qdisc/device queue, then allow XPS to select + * another queue. + */ + skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - skb_set_owner_w(skb, sk); + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = tcp_wfree; + atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); @@ -902,7 +974,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -932,18 +1004,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || !sk_can_gso(sk) || - skb->ip_summed == CHECKSUM_NONE) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Make sure we own this skb before messing gso_size/gso_segs */ + WARN_ON_ONCE(skb_cloned(skb)); + + if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + shinfo->gso_segs = 1; + shinfo->gso_size = 0; + shinfo->gso_type = 0; } else { - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); - skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type = sk->sk_gso_type; + shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + shinfo->gso_size = mss_now; + shinfo->gso_type = sk->sk_gso_type; } } @@ -998,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -1013,13 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. 
*/ - buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, nsize, gfp); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ @@ -1092,35 +1166,36 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, */ static void __pskb_trim_head(struct sk_buff *skb, int len) { + struct skb_shared_info *shinfo; int i, k, eat; eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; } eat = len; k = 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + shinfo = skb_shinfo(skb); + for (i = 0; i < shinfo->nr_frags; i++) { + int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + shinfo->frags[k] = shinfo->frags[i]; if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + shinfo->frags[k].page_offset += eat; + skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } - skb_shinfo(skb)->nr_frags = k; + shinfo->nr_frags = k; skb_reset_tail_pointer(skb); skb->data_len -= len; @@ -1130,7 +1205,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; __pskb_trim_head(skb, len); @@ -1150,8 +1225,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1180,13 +1255,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - return mss_now; } +/* Calculate MSS. Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { @@ -1301,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. 
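Aside (not part of the patch): the split above separates the raw MTU-to-MSS computation (__tcp_mtu_to_mss) from the option-size subtraction (tcp_mtu_to_mss) so callers such as the MTU probe do not count options twice. A rough standalone version of the arithmetic for plain IPv4 with timestamps; the 20/20/12-byte header sizes are the common case and the 48-byte floor follows the hunk.

#include <stdio.h>

/* Rough userspace model: strip network and transport headers, keep the
 * 48-byte floor, then subtract the per-segment TCP options in use.
 */
static int mtu_to_mss(int pmtu, int opts_len)
{
	int iph = 20, tcph = 20;
	int mss = pmtu - iph - tcph;

	if (mss < 48)
		mss = 48;
	return mss - opts_len;
}

int main(void)
{
	printf("1500 MTU, timestamps: mss=%d\n", mtu_to_mss(1500, 12));
	printf("9000 MTU, timestamps: mss=%d\n", mtu_to_mss(9000, 12));
	printf("tiny MTU,  no opts  : mss=%d\n", mtu_to_mss(68, 0));
	return 0;
}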
*/ + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out >= tp->snd_cwnd) { + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } + + if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1321,36 +1431,72 @@ static void tcp_cwnd_validate(struct sock *sk) } } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) +{ + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. 
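Aside (not part of the patch): the reworked tcp_nagle_check() above takes a precomputed "partial" flag instead of re-deriving skb->len < mss, and tcp_minshall_update() now avoids a divide by comparing skb->len against pcount * mss. A compact standalone model of the send/defer decision; the connection state and helpers are simplified.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NAGLE_CORK 1
#define NAGLE_OFF  2

struct conn {
	uint32_t snd_una;	/* oldest unacked sequence        */
	uint32_t snd_nxt;	/* next sequence to send          */
	uint32_t snd_sml;	/* end of the last small segment  */
	unsigned int packets_out;
};

static bool after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

/* Minshall: a small segment we sent earlier is still unacknowledged. */
static bool minshall_check(const struct conn *c)
{
	return after32(c->snd_sml, c->snd_una) && !after32(c->snd_sml, c->snd_nxt);
}

/* Return true when Nagle says "hold this sub-MSS tail for now". */
static bool nagle_defers(bool partial, const struct conn *c, int nonagle)
{
	return partial &&
	       ((nonagle & NAGLE_CORK) ||
		(!nonagle && c->packets_out && minshall_check(c)));
}

int main(void)
{
	struct conn c = { .snd_una = 1000, .snd_nxt = 1700,
			  .snd_sml = 1700, .packets_out = 1 };

	printf("small tail, small pkt in flight : defer=%d\n",
	       nagle_defers(true, &c, 0));
	printf("same but TCP_NODELAY            : defer=%d\n",
	       nagle_defers(true, &c, NAGLE_OFF));
	printf("full-sized segment              : defer=%d\n",
	       nagle_defers(false, &c, 0));
	return 0;
}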
+ */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, + int nonagle) +{ + return partial && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; - return needed - needed % mss_now; + partial = needed % mss_now; + /* If last segment is not a full MSS, check if Nagle rules allow us + * to include this last segment in this skb. + * Otherwise, we'll split the skb at last MSS boundary + */ + if (tcp_nagle_check(partial != 0, tp, nonagle)) + return needed - partial; + + return needed; } /* Can at least one segment of SKB be sent right now, according to the @@ -1390,28 +1536,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, return tso_segs; } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml, tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int mss_now, int nonagle) -{ - return skb->len < mss_now && - ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} /* Return true if the Nagle test allows this packet to be * sent now. @@ -1428,14 +1552,11 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (nonagle & TCP_NAGLE_PUSH) return true; - /* Don't use the nagle rule for urgent data (or for the final FIN). - * Nagle can be ignored during F-RTO too (see RFC4138). - */ - if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; @@ -1504,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now); + return tcp_fragment(sk, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) @@ -1547,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. 
*/ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1577,7 +1699,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + tp->xmit_size_goal_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1604,8 +1727,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; } - /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies << 1); + /* Ok, it looks like it is advisable to defer. + * Do not rearm the timer if already set to not break TCP ACK clocking. + */ + if (!tp->tso_deferred) + tp->tso_deferred = 1 | (jiffies << 1); + + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; return true; @@ -1753,6 +1882,9 @@ static int tcp_mtu_probe(struct sock *sk) * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. + * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ @@ -1764,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1783,9 +1916,18 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) + goto repair; /* Skip network transmission */ + cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) - break; + if (!cwnd_quota) { + is_cwnd_limited = true; + if (push_one == 2) + /* Force out a loss probe pkt. */ + cwnd_quota = 1; + else + break; + } if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; @@ -1796,14 +1938,42 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + break; + } + + /* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates + * Alas, some drivers / subsystems require a fair amount + * of queued bytes to ensure line rate. + * One example is wifi aggregation (802.11 AMPDU) + */ + limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, + sk->sk_pacing_rate >> 10); + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. 
+ */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs), + nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -1814,6 +1984,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; +repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ @@ -1825,14 +1996,153 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (push_one) break; } - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery) - tp->prr_out += sent_pkts; if (likely(sent_pkts)) { - tcp_cwnd_validate(sk); + if (tcp_in_cwnd_reduction(sk)) + tp->prr_out += sent_pkts; + + /* Send one loss probe per tail loss episode. */ + if (push_one != 2) + tcp_schedule_loss_probe(sk); + tcp_cwnd_validate(sk, is_cwnd_limited); + return false; + } + return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout, tlp_time_stamp, rto_time_stamp; + u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) + return false; + /* No consecutive loss probes. */ + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { + tcp_rearm_rto(sk); + return false; + } + /* Don't do any loss probe on a Fast Open connection before 3WHS + * finishes. + */ + if (sk->sk_state == TCP_SYN_RECV) + return false; + + /* TLP is only scheduled when next timer event is RTO. */ + if (icsk->icsk_pending != ICSK_TIME_RETRANS) + return false; + + /* Schedule a loss probe in 2*RTT for SACK capable connections + * in Open state, that are either limited by cwnd or application. + */ + if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; + + if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && + tcp_send_head(sk)) + return false; + + /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + * for delayed ack when there's one outstanding packet. + */ + timeout = rtt << 1; + if (tp->packets_out == 1) + timeout = max_t(u32, timeout, + (rtt + (rtt >> 1) + TCP_DELACK_MAX)); + timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + + /* If RTO is shorter, just schedule TLP in its place. */ + tlp_time_stamp = tcp_time_stamp + timeout; + rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; + if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { + s32 delta = rto_time_stamp - tcp_time_stamp; + if (delta > 0) + timeout = delta; + } + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, + TCP_RTO_MAX); + return true; +} + +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. 
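Aside (not part of the patch): tcp_schedule_loss_probe() above arms the probe at roughly 2*RTT, stretches it by RTT/2 plus the delayed-ACK maximum when only one packet is in flight, floors it at 10 ms, and never lets it fire later than the pending RTO. A standalone rendering of that timeout math in milliseconds; the 200 ms delayed-ACK value is the usual default, listed here only for illustration.

#include <stdio.h>

#define DELACK_MAX_MS	200	/* typical TCP_DELACK_MAX */
#define TLP_FLOOR_MS	10

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* Probe timeout as scheduled by tcp_schedule_loss_probe(), in ms. */
static unsigned int tlp_timeout_ms(unsigned int srtt_ms,
				   unsigned int packets_out,
				   unsigned int time_to_rto_ms)
{
	unsigned int timeout = srtt_ms * 2;

	if (packets_out == 1)
		timeout = max_u(timeout, srtt_ms + srtt_ms / 2 + DELACK_MAX_MS);
	timeout = max_u(timeout, TLP_FLOOR_MS);

	if (timeout > time_to_rto_ms)	/* RTO would fire first: use it */
		timeout = time_to_rto_ms;
	return timeout;
}

int main(void)
{
	printf("rtt=50ms, 4 pkts out, rto in 400ms -> PTO %ums\n",
	       tlp_timeout_ms(50, 4, 400));
	printf("rtt=50ms, 1 pkt  out, rto in 400ms -> PTO %ums\n",
	       tlp_timeout_ms(50, 1, 400));
	printf("rtt=2ms,  4 pkts out, rto in 300ms -> PTO %ums\n",
	       tlp_timeout_ms(2, 4, 300));
	return 0;
}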
+ */ +static bool skb_still_in_host_queue(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } + return false; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. + */ +void tcp_send_loss_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int pcount; + int mss = tcp_current_mss(sk); + int err = -1; + + if (tcp_send_head(sk) != NULL) { + err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + goto rearm_timer; } - return !tp->packets_out && tcp_send_head(sk); + + /* At most one outstanding TLP retransmission. */ + if (tp->tlp_high_seq) + goto rearm_timer; + + /* Retransmit last segment. */ + skb = tcp_write_queue_tail(sk); + if (WARN_ON(!skb)) + goto rearm_timer; + + if (skb_still_in_host_queue(sk, skb)) + goto rearm_timer; + + pcount = tcp_skb_pcount(skb); + if (WARN_ON(!pcount)) + goto rearm_timer; + + if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + GFP_ATOMIC))) + goto rearm_timer; + skb = tcp_write_queue_tail(sk); + } + + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + + err = __tcp_retransmit_skb(sk, skb); + + /* Record snd_nxt for loss detection. */ + if (likely(!err)) + tp->tlp_high_seq = tp->snd_nxt; + +rearm_timer: + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); + + if (likely(!err)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBES); } /* Push out any pending frames which were held back due to @@ -1849,7 +2159,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, if (unlikely(sk->sk_state == TCP_CLOSE)) return; - if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_atomic(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1929,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk) */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); - int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int allowed_space = tcp_full_space(sk); + int full_space = min_t(int, tp->window_clamp, allowed_space); int window; if (mss > full_space) @@ -1942,7 +2254,19 @@ u32 __tcp_select_window(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); - if (free_space < mss) + /* free_space might become our new window, make sure we don't + * increase it due to wscale. + */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + /* if free space is less than mss estimate, or is below 1/16th + * of the maximum allowed, try to move to zero-window, else + * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and + * new incoming data is dropped due to memory limits. + * With large window, mss test triggers way too late in order + * to announce zero window in time before rmem limit kicks in. + */ + if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } @@ -2092,7 +2416,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. 
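Aside (not part of the patch): __tcp_select_window() above now rounds free space down to a window-scale multiple and drops to a zero window once free space falls under either one MSS or 1/16 of the allowed receive space, so the advertisement shrinks before the rmem limit starts dropping packets. A simplified standalone version of that decision; the buffer numbers in main() are made up.

#include <stdio.h>

/* Simplified model of the zero-window decision in __tcp_select_window(). */
static unsigned int select_window(unsigned int free_space,
				  unsigned int allowed_space,
				  unsigned int mss, unsigned int wscale)
{
	free_space &= ~((1u << wscale) - 1);	/* round down to 2^wscale */

	if (free_space < (allowed_space >> 4) || free_space < mss)
		return 0;
	return free_space;
}

int main(void)
{
	unsigned int allowed = 1 << 20;		/* 1 MiB of receive space */

	printf("plenty free : %u\n", select_window(512 * 1024, allowed, 1448, 7));
	printf("still open  : %u\n", select_window(70 * 1024, allowed, 1448, 7));
	printf("nearly full : %u\n", select_window(40 * 1024, allowed, 1448, 7));
	return 0;
}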
*/ -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -2111,6 +2435,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; + if (skb_still_in_host_queue(sk, skb)) + return -EBUSY; + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); @@ -2133,12 +2460,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -EAGAIN; if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss)) + if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { int oldpcount = tcp_skb_pcount(skb); if (unlikely(oldpcount > 1)) { + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; tcp_init_tso_segs(sk, skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } @@ -2146,28 +2475,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); - /* Some Solaris stacks overoptimize and ignore the FIN on a - * retransmit when old data is attached. So strip it off - * since it is cheap to do so and saves bytes on the network. - */ - if (skb->len > 0 && - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { - if (!pskb_trim(skb, 0)) { - /* Reuse, even though it does some unnecessary work */ - tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, - TCP_SKB_CB(skb)->tcp_flags); - skb->ip_summed = CHECKSUM_NONE; - } - } - /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - /* make sure skb->data is aligned on arches that require it */ - if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { + /* make sure skb->data is aligned on arches that require it + * and check if ack-trimming & collapsing extended the headroom + * beyond what csum_start can cover. + */ + if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || + skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : @@ -2176,12 +2494,23 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - if (err == 0) { + if (likely(!err)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; /* Update global TCP statistics. */ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans++; + } + return err; +} + +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err = __tcp_retransmit_skb(sk, skb); + if (err == 0) { #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); @@ -2196,13 +2525,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans += tcp_skb_pcount(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). 
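For reference (not part of the patch), skb->csum_start is a 16-bit offset from skb->head, which is why the retransmit path above now takes a private copy once ack-trimming or collapsing has pushed the headroom to 0xFFFF or beyond; the decision reduces to:

/* Copy instead of clone-and-transmit when skb->data is unaligned on a
 * strict-alignment architecture, or when the headroom can no longer be
 * expressed in the 16-bit csum_start field.
 */
int needs_private_copy(unsigned long data_ptr, unsigned int headroom,
                       int net_ip_align)
{
    return (net_ip_align && (data_ptr & 3)) || headroom >= 0xFFFF;
}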
*/ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } else if (err != -EBUSY) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } + + if (tp->undo_retrans < 0) + tp->undo_retrans = 0; + tp->undo_retrans += tcp_skb_pcount(skb); return err; } @@ -2323,13 +2656,12 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + if (tcp_retransmit_skb(sk, skb)) return; - } + NET_INC_STATS_BH(sock_net(sk), mib_idx); - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery) + if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); if (skb == tcp_write_queue_head(sk)) @@ -2442,59 +2774,43 @@ int tcp_send_synack(struct sock *sk) return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -/* Prepare a SYN-ACK. */ +/** + * tcp_make_synack - Prepare a SYN-ACK. + * sk: listener socket + * dst: dst entry attached to the SYNACK + * req: request_sock pointer + * + * Allocate one skb and build a SYNACK packet. + * @dst is consumed : Caller should not use it again. + */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp) + struct tcp_fastopen_cookie *foc) { struct tcp_out_options opts; - struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - const struct tcp_cookie_values *cvp = tp->cookie_values; struct tcphdr *th; struct sk_buff *skb; struct tcp_md5sig_key *md5; int tcp_header_size; int mss; - int s_data_desired = 0; - if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) - s_data_desired = cvp->s_data_desired; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); - if (skb == NULL) + skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + if (unlikely(!skb)) { + dst_release(dst); return NULL; - + } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); - skb_dst_set(skb, dst_clone(dst)); + skb_dst_set(skb, dst); + security_skb_owned_by(skb, sk); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - - /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); - - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - mss - (ireq->tstamp_ok ? 
TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; - } - memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) @@ -2502,9 +2818,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5, xvp) - + sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2514,56 +2829,23 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = ireq->loc_port; - th->dest = ireq->rmt_port; + th->source = htons(ireq->ir_num); + th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is * not even correctly set) */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPHDR_SYN | TCPHDR_ACK); - if (OPTION_COOKIE_EXTENSION & opts.options) { - if (s_data_desired) { - u8 *buf = skb_put(skb, s_data_desired); - - /* copy data directly from the listening socket. */ - memcpy(buf, cvp->s_data_payload, s_data_desired); - TCP_SKB_CB(skb)->end_seq += s_data_desired; - } - - if (opts.hash_size > 0) { - __u32 workspace[SHA_WORKSPACE_WORDS]; - u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; - u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; - - /* Secret recipe depends on the Timestamp, (future) - * Sequence and Acknowledgment Numbers, Initiator - * Cookie, and others handled by IP variant caller. - */ - *tail-- ^= opts.tsval; - *tail-- ^= tcp_rsk(req)->rcv_isn + 1; - *tail-- ^= TCP_SKB_CB(skb)->seq + 1; - - /* recommended */ - *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); - *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ - - sha_transform((__u32 *)&xvp->cookie_bakery[0], - (char *)mess, - &workspace[0]); - opts.hash_location = - (__u8 *)&xvp->cookie_bakery[0]; - } - } - th->seq = htonl(TCP_SKB_CB(skb)->seq); - th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); + /* XXX data is queued and acked as is. No buffer/window check */ + th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2578,7 +2860,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. 
*/ -void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2637,6 +2919,8 @@ void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; + else + tp->rcv_tstamp = tcp_time_stamp; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; @@ -2645,6 +2929,120 @@ void tcp_connect_init(struct sock *sk) tcp_clear_retrans(tp); } +static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + tcb->end_seq += skb->len; + skb_header_release(skb); + __tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + tp->write_seq = tcb->end_seq; + tp->packets_out += tcp_skb_pcount(skb); +} + +/* Build and send a SYN with data and (cached) Fast Open cookie. However, + * queue a data-only packet after the regular SYN, such that regular SYNs + * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges + * only the SYN sequence, the data are retransmitted in the first ACK. + * If cookie is not cached or other error occurs, falls back to send a + * regular SYN with Fast Open cookie request option. + */ +static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_fastopen_request *fo = tp->fastopen_req; + int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; + struct sk_buff *syn_data = NULL, *data; + unsigned long last_syn_loss = 0; + + tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ + tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, + &syn_loss, &last_syn_loss); + /* Recurring FO SYN losses: revert to regular handshake temporarily */ + if (syn_loss > 1 && + time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { + fo->cookie.len = -1; + goto fallback; + } + + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) + fo->cookie.len = -1; + else if (fo->cookie.len <= 0) + goto fallback; + + /* MSS for SYN-data is based on cached MSS and bounded by PMTU and + * user-MSS. Reserve maximum option space for middleboxes that add + * private TCP options. 
The cost is reduced data space in SYN :( + */ + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + MAX_TCP_OPTION_SPACE; + + space = min_t(size_t, space, fo->size); + + /* limit to order-0 allocations */ + space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + + syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, + sk->sk_allocation); + if (syn_data == NULL) + goto fallback; + + for (i = 0; i < iovlen && syn_data->len < space; ++i) { + struct iovec *iov = &fo->data->msg_iov[i]; + unsigned char __user *from = iov->iov_base; + int len = iov->iov_len; + + if (syn_data->len + len > space) + len = space - syn_data->len; + else if (i + 1 == iovlen) + /* No more data pending in inet_wait_for_connect() */ + fo->data = NULL; + + if (skb_add_data(syn_data, from, len)) + goto fallback; + } + + /* Queue a data-only packet after the regular SYN for retransmission */ + data = pskb_copy(syn_data, sk->sk_allocation); + if (data == NULL) + goto fallback; + TCP_SKB_CB(data)->seq++; + TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; + TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); + tcp_connect_queue_skb(sk, data); + fo->copied = data->len; + + /* syn_data is about to be sent, we need to take current time stamps + * for the packets that are in write queue : SYN packet and DATA + */ + skb_mstamp_get(&syn->skb_mstamp); + data->skb_mstamp = syn->skb_mstamp; + + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + tp->syn_data = (fo->copied > 0); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); + goto done; + } + syn_data = NULL; + +fallback: + /* Send a regular SYN with Fast Open cookie request option */ + if (fo->cookie.len > 0) + fo->cookie.len = 0; + err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); + if (err) + tp->syn_fastopen = 0; + kfree_skb(syn_data); +done: + fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ + return err; +} + /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { @@ -2654,6 +3052,11 @@ int tcp_connect(struct sock *sk) tcp_connect_init(sk); + if (unlikely(tp->repair)) { + tcp_finish_connect(sk, NULL); + return 0; + } + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; @@ -2662,17 +3065,13 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); - /* Send it off. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->retrans_stamp = TCP_SKB_CB(buff)->when; - skb_header_release(buff); - __tcp_add_write_queue_tail(sk, buff); - sk->sk_wmem_queued += buff->truesize; - sk_mem_charge(sk, buff->truesize); - tp->packets_out += tcp_skb_pcount(buff); - err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + /* Send off SYN; include data in Fast Open. */ + err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; @@ -2714,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk) * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. 
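For reference (not part of the patch), the payload budget computed in tcp_send_syn_data() above is a chain of caps. A standalone sketch, where mss already reflects the PMTU- and user-MSS-bounded value from the hunk and order0_head stands for SKB_MAX_HEAD(MAX_TCP_HEADER):

size_t syn_data_space(size_t mss, size_t max_option_space,
                      size_t queued_bytes, size_t order0_head)
{
    size_t space = mss > max_option_space ? mss - max_option_space : 0;

    if (space > queued_bytes)
        space = queued_bytes;        /* no more than the caller queued */
    if (space > order0_head)
        space = order0_head;         /* keep the copy in an order-0 head */
    return space;
}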
*/ - if (tp->srtt) { - int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); + if (tp->srtt_us) { + int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), + TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -2759,7 +3159,7 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; @@ -2774,7 +3174,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } /* This routine sends a packet with an out of date sequence @@ -2794,7 +3194,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (skb == NULL) return -1; @@ -2813,7 +3213,6 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq; tcp_xmit_probe_skb(sk, 0); } } @@ -2844,7 +3243,7 @@ int tcp_write_wakeup(struct sock *sk) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) + if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 4526fe68e60..3b66610d415 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -38,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1"); -static int port __read_mostly = 0; +static int port __read_mostly; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); @@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); module_param(bufsize, uint, 0); +static unsigned int fwmark __read_mostly; +MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); +module_param(fwmark, uint, 0); + static int full __read_mostly; MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); module_param(full, int, 0); @@ -54,12 +58,16 @@ static const char procname[] = "tcpprobe"; struct tcp_log { ktime_t tstamp; - __be32 saddr, daddr; - __be16 sport, dport; + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } src, dst; u16 length; u32 snd_nxt; u32 snd_una; u32 snd_wnd; + u32 rcv_wnd; u32 snd_cwnd; u32 ssthresh; u32 srtt; @@ -86,19 +94,29 @@ static inline int tcp_probe_avail(void) return bufsize - tcp_probe_used() - 1; } +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + + /* * Hook inserted to be called before each receive packet. * Note: arguments must match tcp_rcv_established()! 
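For reference (not part of the patch), tp->srtt_us stores eight times the smoothed RTT in microseconds, so the delayed-ACK clamp above first shifts right by three and only then converts to jiffies. An approximate standalone sketch:

/* Recover the RTT from the srtt_us encoding, convert to jiffies
 * (truncating, where the kernel's usecs_to_jiffies() rounds), and
 * never go below the minimum delayed-ACK timeout.
 */
unsigned int delack_rtt_jiffies(unsigned int srtt_us, unsigned int hz,
                                unsigned int delack_min)
{
    unsigned int rtt = ((srtt_us >> 3) * hz) / 1000000u;

    return rtt > delack_min ? rtt : delack_min;
}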
*/ -static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned int len) +static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); - /* Only update if port matches */ - if ((port == 0 || ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port) && + /* Only update if port or skb mark matches */ + if (((port == 0 && fwmark == 0) || + ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port || + (fwmark > 0 && skb->mark == fwmark)) && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { spin_lock(&tcp_probe.lock); @@ -107,17 +125,36 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_log *p = tcp_probe.log + tcp_probe.head; p->tstamp = ktime_get(); - p->saddr = inet->inet_saddr; - p->sport = inet->inet_sport; - p->daddr = inet->inet_daddr; - p->dport = inet->inet_dport; + switch (sk->sk_family) { + case AF_INET: + tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); + tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); + break; + case AF_INET6: + memset(&p->src.v6, 0, sizeof(p->src.v6)); + memset(&p->dst.v6, 0, sizeof(p->dst.v6)); +#if IS_ENABLED(CONFIG_IPV6) + p->src.v6.sin6_family = AF_INET6; + p->src.v6.sin6_port = inet->inet_sport; + p->src.v6.sin6_addr = inet6_sk(sk)->saddr; + + p->dst.v6.sin6_family = AF_INET6; + p->dst.v6.sin6_port = inet->inet_dport; + p->dst.v6.sin6_addr = sk->sk_v6_daddr; +#endif + break; + default: + BUG(); + } + p->length = skb->len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; p->snd_wnd = tp->snd_wnd; + p->rcv_wnd = tp->rcv_wnd; p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt >> 3; + p->srtt = tp->srtt_us >> 3; tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } @@ -128,7 +165,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, } jprobe_return(); - return 0; } static struct jprobe tcp_jprobe = { @@ -157,13 +193,11 @@ static int tcpprobe_sprint(char *tbuf, int n) = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); return scnprintf(tbuf, n, - "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", + "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec, - &p->saddr, ntohs(p->sport), - &p->daddr, ntohs(p->dport), - p->length, p->snd_nxt, p->snd_una, - p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); + &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, + p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } static ssize_t tcpprobe_read(struct file *file, char __user *buf, @@ -176,7 +210,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, return -EINVAL; while (cnt < len) { - char tbuf[164]; + char tbuf[256]; int width; /* Wait for data in buffer */ @@ -223,6 +257,13 @@ static __init int tcpprobe_init(void) { int ret = -ENOMEM; + /* Warning: if the function signature of tcp_rcv_established, + * has been changed, you also have to change the signature of + * jtcp_rcv_established, otherwise you end up right here! 
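For reference (not part of the patch), the sampling filter in jtcp_rcv_established() above reduces to the predicate below; with full=0 the logger additionally restricts samples to cwnd changes.

int tcp_probe_match(unsigned short sport, unsigned short dport,
                    unsigned int skb_mark, unsigned short port_filter,
                    unsigned int fwmark_filter)
{
    if (port_filter == 0 && fwmark_filter == 0)
        return 1;                    /* no filter configured: log everything */
    return sport == port_filter || dport == port_filter ||
           (fwmark_filter > 0 && skb_mark == fwmark_filter);
}

With the parameters defined in this file, the module would typically be loaded as "modprobe tcp_probe port=80 fwmark=1" and read from /proc/net/tcpprobe.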
+ */ + BUILD_BUG_ON(__same_type(tcp_rcv_established, + jtcp_rcv_established) == 0); + init_waitqueue_head(&tcp_probe.wait); spin_lock_init(&tcp_probe.lock); @@ -234,17 +275,18 @@ static __init int tcpprobe_init(void) if (!tcp_probe.log) goto err0; - if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops)) + if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) goto err0; ret = register_jprobe(&tcp_jprobe); if (ret) goto err1; - pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); + pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", + port, fwmark, bufsize); return 0; err1: - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); err0: kfree(tcp_probe.log); return ret; @@ -253,7 +295,7 @@ module_init(tcpprobe_init); static __exit void tcpprobe_exit(void) { - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); unregister_jprobe(&tcp_jprobe); kfree(tcp_probe.log); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 8ce55b8aaec..8250949b885 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,15 +15,15 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); else tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); } @@ -38,7 +38,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "scalable", diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e911e6c523e..286227abed1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; -static void tcp_write_timer(unsigned long); -static void tcp_delack_timer(unsigned long); -static void tcp_keepalive_timer (unsigned long data); - -void tcp_init_xmit_timers(struct sock *sk) -{ - inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, - &tcp_keepalive_timer); -} -EXPORT_SYMBOL(tcp_init_xmit_timers); - static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -167,12 +156,19 @@ static bool retransmits_timed_out(struct sock *sk, static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int retry_until; bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) + if (icsk->icsk_retransmits) { dst_negative_advice(sk); + if (tp->syn_fastopen || tp->syn_data) + tcp_fastopen_cache_set(sk, 0, NULL, true); + if (tp->syn_data) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } retry_until = icsk->icsk_syn_retries ? 
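For reference (not part of the patch), every congestion module touched below follows the same interface change: the in_flight argument is gone, tcp_is_cwnd_limited() now takes only the socket, and slow start is credited with the number of newly acked segments. A minimal hook in the new shape (kernel context, needs <net/tcp.h>):

#include <net/tcp.h>

static void example_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (!tcp_is_cwnd_limited(sk))
        return;                              /* application limited */
    if (tp->snd_cwnd <= tp->snd_ssthresh)
        tcp_slow_start(tp, acked);           /* grow by newly acked segments */
    else
        tcp_cong_avoid_ai(tp, tp->snd_cwnd); /* Reno-like additive increase */
}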
: sysctl_tcp_syn_retries; syn_set = true; } else { @@ -205,21 +201,11 @@ static int tcp_write_timeout(struct sock *sk) return 0; } -static void tcp_delack_timer(unsigned long data) +void tcp_delack_timer_handler(struct sock *sk) { - struct sock *sk = (struct sock *)data; struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); - goto out_unlock; - } - sk_mem_reclaim_partial(sk); if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) @@ -260,7 +246,22 @@ static void tcp_delack_timer(unsigned long data) out: if (sk_under_memory_pressure(sk)) sk_mem_reclaim(sk); -out_unlock: +} + +static void tcp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_delack_timer_handler(sk); + } else { + inet_csk(sk)->icsk_ack.blocked = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + /* deleguate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); + } bh_unlock_sock(sk); sock_put(sk); } @@ -311,6 +312,35 @@ static void tcp_probe_timer(struct sock *sk) } /* + * Timer for Fast Open socket to retransmit SYNACK. Note that the + * sk here is the child socket, not the parent (listener) socket. + */ +static void tcp_fastopen_synack_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int max_retries = icsk->icsk_syn_retries ? : + sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + struct request_sock *req; + + req = tcp_sk(sk)->fastopen_rsk; + req->rsk_ops->syn_ack_timeout(sk, req); + + if (req->num_timeout >= max_retries) { + tcp_write_err(sk); + return; + } + /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error + * returned from rtx_syn_ack() to make it more persistent like + * regular retransmit because if the child socket has been accepted + * it's not good to give up too easily. + */ + inet_rtx_syn_ack(sk, req); + req->num_timeout++; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); +} + +/* * The TCP retransmit timer. */ @@ -319,16 +349,22 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - if (tp->early_retrans_delayed) { - tcp_resume_early_retransmit(sk); + if (tp->fastopen_rsk) { + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && + sk->sk_state != TCP_FIN_WAIT1); + tcp_fastopen_synack_timer(sk); + /* Before we receive ACK to our SYN-ACK don't retransmit + * anything else (e.g., data or FIN segments). + */ return; } - if (!tp->packets_out) goto out; WARN_ON(tcp_write_queue_empty(sk)); + tp->tlp_high_seq = 0; + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. 
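For reference (not part of the patch), both TCP timers above now share one deferral pattern instead of rearming themselves and retrying while the socket is locked by a process. A sketch in kernel context (<net/tcp.h>); the recorded work is run later by tcp_release_cb() at release_sock() time:

static void deferred_timer(struct sock *sk, int deferred_bit,
                           void (*handler)(struct sock *sk))
{
    bh_lock_sock(sk);
    if (!sock_owned_by_user(sk)) {
        handler(sk);                         /* safe to run right now */
    } else if (!test_and_set_bit(deferred_bit, &tcp_sk(sk)->tsq_flags)) {
        sock_hold(sk);                       /* dropped after tcp_release_cb() */
    }
    bh_unlock_sock(sk);
    sock_put(sk);
}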
Our retransmits @@ -345,9 +381,8 @@ void tcp_retransmit_timer(struct sock *sk) } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &np->daddr, + &sk->sk_v6_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt); } @@ -387,11 +422,7 @@ void tcp_retransmit_timer(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), mib_idx); } - if (tcp_use_frto(sk)) { - tcp_enter_frto(sk); - } else { - tcp_enter_loss(sk, 0); - } + tcp_enter_loss(sk, 0); if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, @@ -450,19 +481,11 @@ out_reset_timer: out:; } -static void tcp_write_timer(unsigned long data) +void tcp_write_timer_handler(struct sock *sk) { - struct sock *sk = (struct sock *)data; struct inet_connection_sock *icsk = inet_csk(sk); int event; - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); - goto out_unlock; - } - if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; @@ -472,20 +495,40 @@ static void tcp_write_timer(unsigned long data) } event = icsk->icsk_pending; - icsk->icsk_pending = 0; switch (event) { + case ICSK_TIME_EARLY_RETRANS: + tcp_resume_early_retransmit(sk); + break; + case ICSK_TIME_LOSS_PROBE: + tcp_send_loss_probe(sk); + break; case ICSK_TIME_RETRANS: + icsk->icsk_pending = 0; tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: + icsk->icsk_pending = 0; tcp_probe_timer(sk); break; } out: sk_mem_reclaim(sk); -out_unlock: +} + +static void tcp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_write_timer_handler(sk); + } else { + /* deleguate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); + } bh_unlock_sock(sk); sock_put(sk); } @@ -602,3 +645,10 @@ out: bh_unlock_sock(sk); sock_put(sk); } + +void tcp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); +} +EXPORT_SYMBOL(tcp_init_xmit_timers); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 80fa2bfd7ed..9a5e05f27f4 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,13 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) return min(tp->snd_ssthresh, tp->snd_cwnd-1); } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { - tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -194,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; @@ -243,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) } else if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ - tcp_slow_start(tp); + tcp_slow_start(tp, acked); } else { /* Congestion avoidance. 
*/ @@ -283,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) } /* Use normal slow start */ else if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); } @@ -305,11 +305,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 6c0eea2f824..0531b99d863 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -15,10 +15,10 @@ struct vegas { u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ }; -extern void tcp_vegas_init(struct sock *sk); -extern void tcp_vegas_state(struct sock *sk, u8 ca_state); -extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); -extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +void tcp_vegas_init(struct sock *sk); +void tcp_vegas_state(struct sock *sk, u8 ca_state); +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); #endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index ac43cd747bc..27b9825753d 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,18 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { - tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } /* limited by applications */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* We do the Veno calculations only if we got enough rtt samples */ @@ -133,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u64 target_cwnd; u32 rtt; @@ -152,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ - tcp_slow_start(tp); + tcp_slow_start(tp, acked); } else { /* Congestion avoidance. 
*/ if (veno->diff < beta) { @@ -202,7 +202,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 1b91bf48e27..b94a04ae2ed 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_FRTO: + case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; @@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 05c3b6f0e8e..599b79b8eac 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -3,7 +3,7 @@ * YeAH TCP * * For further details look at: - * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf * */ #include <linux/mm.h> @@ -15,13 +15,13 @@ #include "tcp_vegas.h" -#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck -#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt -#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss -#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion -#define TCP_YEAH_PHY 8 //lin maximum delta from base -#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss -#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count +#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */ +#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */ +#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */ +#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */ +#define TCP_YEAH_PHY 8 /* maximum delta from base */ +#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */ +#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */ #define TCP_SCALABLE_AI_CNT 100U @@ -69,16 +69,16 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + tcp_slow_start(tp, acked); else if (!yeah->doing_reno_now) { /* Scalable */ @@ -213,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { if (yeah->doing_reno_now < TCP_YEAH_RHO) { reduction = yeah->lastQ; - reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); - reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + 
reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); } else reduction = max(tp->snd_cwnd>>1, 2U); @@ -226,11 +226,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { } static struct tcp_congestion_ops tcp_yeah __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, .cong_avoid = tcp_yeah_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eaca73644e7..7d5a8661df7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -103,11 +103,14 @@ #include <linux/seq_file.h> #include <net/net_namespace.h> #include <net/icmp.h> +#include <net/inet_hashtables.h> #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> +#include <trace/events/skb.h> +#include <net/busy_poll.h> #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -138,6 +141,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, { struct sock *sk2; struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); sk_nulls_for_each(sk2, node, &hslot->head) if (net_eq(sock_net(sk2), net) && @@ -146,6 +150,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!sk2->sk_reuseport || !sk->sk_reuseport || + !uid_eq(uid, sock_i_uid(sk2))) && (*saddr_comp)(sk, sk2)) { if (bitmap) __set_bit(udp_sk(sk2)->udp_port_hash >> log, @@ -168,6 +174,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, { struct sock *sk2; struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); @@ -178,6 +185,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!sk2->sk_reuseport || !sk->sk_reuseport || + !uid_eq(uid, sock_i_uid(sk2))) && (*saddr_comp)(sk, sk2)) { res = 1; break; @@ -211,10 +220,10 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned short first, last; DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = net_random(); + rand = prandom_u32(); first = (((u64)rand * remaining) >> 32) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -237,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); @@ -336,26 +345,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, !ipv6_only_sock(sk)) { struct inet_sock *inet = inet_sk(sk); - score = (sk->sk_family == PF_INET ? 1 : 0); + score = (sk->sk_family == PF_INET ? 
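For reference (not part of the patch), the clause added to both UDP bind-conflict helpers above encodes the SO_REUSEPORT sharing rule: a port may be shared only when both sockets requested reuseport and were bound by the same effective uid, so a different user cannot attach itself to an existing reuseport group. Only that clause, as a standalone predicate (true means the reuseport rule does not allow sharing):

int reuseport_conflict(int sk_reuseport, int sk2_reuseport,
                       unsigned int sk_uid, unsigned int sk2_uid)
{
    /* mirrors: (!sk2->sk_reuseport || !sk->sk_reuseport ||
     *           !uid_eq(uid, sock_i_uid(sk2)))                    */
    return !sk_reuseport || !sk2_reuseport || sk_uid != sk2_uid;
}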
2 : 1); if (inet->inet_rcv_saddr) { if (inet->inet_rcv_saddr != daddr) return -1; - score += 2; + score += 4; } if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; - score += 2; + score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; - score += 2; + score += 4; } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) return -1; - score += 2; + score += 4; } } return score; @@ -364,7 +373,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, /* * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) */ -#define SCORE2_MAX (1 + 2 + 2 + 2) static inline int compute_score2(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) @@ -379,26 +387,38 @@ static inline int compute_score2(struct sock *sk, struct net *net, if (inet->inet_num != hnum) return -1; - score = (sk->sk_family == PF_INET ? 1 : 0); + score = (sk->sk_family == PF_INET ? 2 : 1); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; - score += 2; + score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; - score += 2; + score += 4; } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) return -1; - score += 2; + score += 4; } } return score; } +static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) +{ + static u32 udp_ehash_secret __read_mostly; + + net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); + + return __inet_ehashfn(laddr, lport, faddr, fport, + udp_ehash_secret + net_hash_mix(net)); +} + /* called with read_rcu_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, @@ -408,19 +428,29 @@ static struct sock *udp4_lib_lookup2(struct net *net, { struct sock *sk, *result; struct hlist_nulls_node *node; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; begin: result = NULL; - badness = -1; + badness = 0; udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { result = sk; badness = score; - if (score == SCORE2_MAX) - goto exact_match; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = udp_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -430,9 +460,7 @@ begin: */ if (get_nulls_value(node) != slot2) goto begin; - if (result) { -exact_match: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -456,7 +484,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; rcu_read_lock(); if (hslot->count > 10) { @@ -485,13 +514,24 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, } begin: result = NULL; - badness = -1; + badness = 0; sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = udp_ehashfn(net, 
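For reference (not part of the patch), the receive-side selection added to both UDP lookup loops above picks uniformly among the sockets tied for the best score: each newly found tie replaces the current pick with probability 1/matches, so after the walk every tied socket is equally likely (reservoir sampling). A standalone sketch that assumes every tied socket has SO_REUSEPORT set; next_hash stands in for next_pseudo_random32():

struct scored_sock { int score; };

struct scored_sock *pick_reuseport(struct scored_sock *socks, int n,
                                   unsigned int flow_hash,
                                   unsigned int (*next_hash)(unsigned int))
{
    struct scored_sock *result = 0;
    unsigned int hash = flow_hash, matches = 0;
    int badness = 0, i;

    for (i = 0; i < n; i++) {
        if (socks[i].score > badness) {
            result = &socks[i];
            badness = socks[i].score;
            matches = 1;                      /* restart the reservoir */
        } else if (result && socks[i].score == badness) {
            matches++;
            /* switch to this socket with probability 1/matches */
            if ((((unsigned long long)hash * matches) >> 32) == 0)
                result = &socks[i];
            hash = next_hash(hash);
        }
    }
    return result;
}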
daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -520,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { - struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) - return sk; - else - return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - udptable); + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -538,6 +574,26 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, } EXPORT_SYMBOL_GPL(udp4_lib_lookup); +static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif, unsigned short hnum) +{ + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(sk) || + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + return false; + if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) + return false; + return true; +} + static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, @@ -548,20 +604,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, unsigned short hnum = ntohs(loc_port); sk_nulls_for_each_from(s, node) { - struct inet_sock *inet = inet_sk(s); - - if (!net_eq(sock_net(s), net) || - udp_sk(s)->udp_port_hash != hnum || - (inet->inet_daddr && inet->inet_daddr != rmt_addr) || - (inet->inet_dport != rmt_port && inet->inet_dport) || - (inet->inet_rcv_saddr && - inet->inet_rcv_saddr != loc_addr) || - ipv6_only_sock(s) || - (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) - continue; - if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) - continue; - goto found; + if (__udp_is_mcast_sock(net, s, + loc_port, loc_addr, + rmt_port, rmt_addr, + dif, hnum)) + goto found; } s = NULL; found: @@ -615,6 +662,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + ipv4_sk_update_pmtu(skb, sk, info); if (inet->pmtudisc != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; @@ -628,6 +676,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) err = icmp_err_convert[code].errno; } break; + case ICMP_REDIRECT: + ipv4_sk_redirect(skb, sk); + goto out; } /* @@ -673,16 +724,15 @@ EXPORT_SYMBOL(udp_flush_pending_frames); * @src: source IP address * @dst: destination IP address */ -static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); - struct sk_buff *frags = skb_shinfo(skb)->frag_list; int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; - if (!frags) { + if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. 
*/ @@ -691,15 +741,17 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { + struct sk_buff *frags; + /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - do { + skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; - } while ((frags = frags->next)); + } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; @@ -709,6 +761,44 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = CSUM_MANGLED_0; } } +EXPORT_SYMBOL_GPL(udp4_hwcsum); + +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v4_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp_set_csum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { @@ -733,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; @@ -753,7 +843,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) uh->check = CSUM_MANGLED_0; send: - err = ip_send_skb(skb); + err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), @@ -769,7 +859,7 @@ send: /* * Push out all pending data as one UDP datagram. Socket is locked. */ -static int udp_push_pending_frames(struct sock *sk) +int udp_push_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -788,6 +878,7 @@ out: up->pending = 0; return err; } +EXPORT_SYMBOL(udp_push_pending_frames); int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) @@ -822,6 +913,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; @@ -847,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. 
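For reference (not part of the patch), both udp4_hwcsum() and the software branch of udp_set_csum() above end with the same UDP-specific folding rule: a computed checksum of zero must be sent as 0xFFFF, since zero on the wire means "checksum not computed". A standalone sketch of the software computation; the kernel folds the IPv4 pseudo-header in via udp_v4_check(), here it is passed in pre-summed:

unsigned short udp_checksum(const unsigned char *data, unsigned long len,
                            unsigned int pseudo_header_sum)
{
    unsigned long sum = pseudo_header_sum;
    unsigned long i;

    for (i = 0; i + 1 < len; i += 2)
        sum += (unsigned long)((data[i] << 8) | data[i + 1]);
    if (len & 1)
        sum += (unsigned long)(data[len - 1] << 8);  /* pad the odd byte */
    while (sum >> 16)
        sum = (sum & 0xFFFF) + (sum >> 16);          /* fold carries */
    sum = ~sum & 0xFFFF;
    return sum == 0 ? 0xFFFF : (unsigned short)sum;  /* 0 means "no csum" */
}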
*/ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -872,11 +965,12 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(sk, &ipc.tx_flags); - if (err) - return err; + + sock_tx_timestamp(sk, &ipc.tx_flags); + if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, + sk->sk_family == AF_INET6); if (err) return err; if (ipc.opt) @@ -905,7 +999,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, faddr = ipc.opt->opt.faddr; connected = 0; } - tos = RT_TOS(inet->tos); + tos = get_rttos(&ipc, inet); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -931,7 +1025,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk), faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); @@ -940,7 +1034,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -966,7 +1060,7 @@ back_from_confirm: sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); - if (skb && !IS_ERR(skb)) + if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4); goto out; } @@ -1039,6 +1133,9 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, struct udp_sock *up = udp_sk(sk); int ret; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + if (!up->pending) { struct msghdr msg = { .msg_flags = flags|MSG_MORE }; @@ -1101,6 +1198,8 @@ static unsigned int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); @@ -1166,7 +1265,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int peeked, off = 0; @@ -1174,14 +1273,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int is_udplite = IS_UDPLITE(sk); bool slow; - /* - * Check any passed addresses - */ - if (addr_len) - *addr_len = sizeof(*sin); - if (flags & MSG_ERRQUEUE) - return ip_recv_error(sk, msg, len); + return ip_recv_error(sk, msg, len, addr_len); try_again: skb = __skb_recv_datagram(sk, flags | (noblock ? 
MSG_DONTWAIT : 0), @@ -1219,8 +1312,15 @@ try_again: goto csum_copy_err; } - if (err) + if (unlikely(err)) { + trace_kfree_skb(skb, udp_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } goto out_free; + } if (!peeked) UDP_INC_STATS_USER(sock_net(sk), @@ -1234,6 +1334,7 @@ try_again: sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + *addr_len = sizeof(*sin); } if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); @@ -1249,8 +1350,10 @@ out: csum_copy_err: slow = lock_sock_fast(sk); - if (!skb_kill_datagram(sk, skb, flags)) + if (!skb_kill_datagram(sk, skb, flags)) { + UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + } unlock_sock_fast(sk, slow); if (noblock) @@ -1359,8 +1462,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; - if (inet_sk(sk)->inet_daddr) + if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); + } rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { @@ -1428,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), @@ -1476,15 +1585,18 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) - goto drop; + goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; + } rc = 0; - ipv4_pktinfo_prepare(skb); + ipv4_pktinfo_prepare(sk, skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); @@ -1496,6 +1608,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return rc; +csum_error: + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); @@ -1531,6 +1645,18 @@ static void flush_stack(struct sock **stack, unsigned int count, kfree_skb(skb1); } +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ + struct dst_entry *old; + + dst_hold(dst); + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); +} + /* * Multicasts and broadcasts go to each listener. 
* @@ -1591,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { - const struct iphdr *iph; int err; UDP_SKB_CB(skb)->partial_cov = 0; @@ -1603,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return err; } - iph = ip_hdr(skb); - if (uh->check == 0) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - if (!skb_csum_unnecessary(skb)) - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, proto, 0); - /* Probably, we should checksum udp header (it should be in cache - * in any case) and data in tiny packets (< rx copybreak). - */ - - return 0; + return skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* @@ -1659,14 +1770,34 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); + int ret; - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); + + ret = udp_queue_rcv_skb(sk, skb); + sock_put(sk); + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; + } else { + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); + + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + } if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret; + + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1712,12 +1843,149 @@ csum_error: proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); + UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } +/* We can only early demux multicast if there is a single matching socket. + * If more than one socket found returns NULL + */ +static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(loc_port); + unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + struct udp_hslot *hslot = &udp_table.hash[slot]; + + /* Do not bother scanning a too big list */ + if (hslot->count > 10) + return NULL; + + rcu_read_lock(); +begin: + count = 0; + result = NULL; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, + loc_port, loc_addr, + rmt_port, rmt_addr, + dif, hnum)) { + result = sk; + ++count; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. 
+ */ + if (get_nulls_value(node) != slot) + goto begin; + + if (result) { + if (count != 1 || + unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(!__udp_is_mcast_sock(net, result, + loc_port, loc_addr, + rmt_port, rmt_addr, + dif, hnum))) { + sock_put(result); + result = NULL; + } + } + rcu_read_unlock(); + return result; +} + +/* For unicast we should only early demux connected sockets or we can + * break forwarding setups. The chains here can be long so only check + * if the first socket is an exact match and if not move on. + */ +static struct sock *__udp4_lib_demux_lookup(struct net *net, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(loc_port); + unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int slot2 = hash2 & udp_table.mask; + struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; + INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); + const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + + rcu_read_lock(); + result = NULL; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, + rmt_addr, loc_addr, ports, dif)) + result = sk; + /* Only check first socket in chain */ + break; + } + + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(!INET_MATCH(sk, net, acookie, + rmt_addr, loc_addr, + ports, dif))) { + sock_put(result); + result = NULL; + } + } + rcu_read_unlock(); + return result; +} + +void udp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct udphdr *uh; + struct sock *sk; + struct dst_entry *dst; + int dif = skb->dev->ifindex; + + /* validate the packet */ + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) + return; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) + sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, + uh->source, iph->saddr, dif); + else if (skb->pkt_type == PACKET_HOST) + sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, + uh->source, iph->saddr, dif); + else + return; + + if (!sk) + return; + + skb->sk = sk; + skb->destructor = sock_edemux; + dst = sk->sk_rx_dst; + + if (dst) + dst = dst_check(dst, 0); + if (dst) + skb_dst_set_noref(skb, dst); +} + int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); @@ -1725,9 +1993,16 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); + if (static_key_false(&udp_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } } /* @@ -1738,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); - int val; + int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); @@ -1748,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (get_user(val, (int __user *)optval)) return -EFAULT; + valbool = val ? 
1 : 0; + switch (optname) { case UDP_CORK: if (val != 0) { @@ -1777,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } break; + case UDP_NO_CHECK6_TX: + up->no_check6_tx = valbool; + break; + + case UDP_NO_CHECK6_RX: + up->no_check6_rx = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -1859,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->encap_type; break; + case UDP_NO_CHECK6_TX: + val = up->no_check6_tx; + break; + + case UDP_NO_CHECK6_RX: + val = up->no_check6_rx; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: @@ -1916,6 +2209,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; + sock_rps_record_flow(sk); + /* Check for false positives due to checksum errors */ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk)) @@ -1940,6 +2235,7 @@ struct proto udp_prot = { .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, .backlog_rcv = __udp_queue_rcv_skb, + .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, @@ -2048,7 +2344,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) int udp_seq_open(struct inode *inode, struct file *file) { - struct udp_seq_afinfo *afinfo = PDE(inode)->data; + struct udp_seq_afinfo *afinfo = PDE_DATA(inode); struct udp_iter_state *s; int err; @@ -2084,13 +2380,13 @@ EXPORT_SYMBOL(udp_proc_register); void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) { - proc_net_remove(net, afinfo->name); + remove_proc_entry(afinfo->name, net->proc_net); } EXPORT_SYMBOL(udp_proc_unregister); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, - int bucket, int *len) + int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -2099,29 +2395,30 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops), len); + atomic_read(&sp->sk_drops)); } int udp4_seq_show(struct seq_file *seq, void *v) { + seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; - int len; - udp4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len, ""); + udp4_format_sock(v, seq, state->bucket); } + seq_pad(seq, '\n'); return 0; } @@ -2230,65 +2527,78 @@ void __init udp_init(void) sysctl_udp_wmem_min = SK_MEM_QUANTUM; } -int udp4_ufo_send_check(struct sk_buff *skb) +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + 
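
The UDP_NO_CHECK6_TX / UDP_NO_CHECK6_RX cases added to udp_lib_setsockopt() and udp_lib_getsockopt() above are ordinary integer socket options. A short userspace sketch of a tunnel endpoint enabling both on an IPv6 UDP socket; it assumes uapi headers that already export the two constants (<linux/udp.h> on kernels carrying this change) and uses IPPROTO_UDP as the option level.

	/* Enable zero UDP checksums on an IPv6 socket (tunnel use only). */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/udp.h>	/* UDP_NO_CHECK6_TX / UDP_NO_CHECK6_RX */

	int main(void)
	{
		int one = 1;
		int fd = socket(AF_INET6, SOCK_DGRAM, 0);

		if (fd < 0) {
			perror("socket");
			return 1;
		}
		/* may transmit UDP/IPv6 datagrams with a zero checksum */
		if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one)) < 0)
			perror("UDP_NO_CHECK6_TX");
		/* accept incoming zero-checksum UDP/IPv6 datagrams */
		if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one)) < 0)
			perror("UDP_NO_CHECK6_RX");
		close(fd);
		return 0;
	}

Zero UDP checksums over IPv6 are only intended for encapsulated tunnel traffic (RFC 6935/6936), which is why the knob is per-socket rather than global.
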
netdev_features_t features) { - const struct iphdr *iph; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(*uh))) - return -EINVAL; + struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; + int mac_len = skb->mac_len; + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + __be16 protocol = skb->protocol; + netdev_features_t enc_features; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; + + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) + goto out; - iph = ip_hdr(skb); - uh = udp_hdr(skb); + skb->encapsulation = 0; + __skb_pull(skb, tnl_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = htons(ETH_P_TEB); + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); + goto out; + } - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - return 0; -} + outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; + skb = segs; + do { + struct udphdr *uh; + int len; -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - int offset; - __wsum csum; + skb_reset_inner_headers(skb); + skb->encapsulation = 1; - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; + skb->mac_len = mac_len; - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; + skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; + uh = udp_hdr(skb); + uh->len = htons(len); - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || - !(type & (SKB_GSO_UDP)))) - goto out; + if (need_csum) { + __be32 delta = htonl(oldlen + len); - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); - segs = NULL; - goto out; - } + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - /* Fragment the skb. 
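
The need_csum branch above does not recompute each segment's UDP checksum from scratch; it folds the difference between the old and new length words into the existing value, the same idea as the RFC 1624 incremental checksum update. A standalone illustration of that arithmetic on an invented 8-byte "packet" (plain C, nothing kernel-specific):

	/* One's-complement Internet checksum plus an RFC 1624-style incremental
	 * update, mirroring how the segmentation loop patches uh->check when a
	 * length field changes.
	 */
	#include <stdint.h>
	#include <stdio.h>

	static uint16_t csum(const uint8_t *p, size_t len)
	{
		uint32_t sum = 0;

		for (; len > 1; p += 2, len -= 2)
			sum += (uint32_t)p[0] << 8 | p[1];
		if (len)
			sum += (uint32_t)p[0] << 8;
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}

	/* HC' = ~(~HC + ~m + m')  -- adjust an existing checksum for one
	 * 16-bit field changing from old to new.
	 */
	static uint16_t csum_update(uint16_t check, uint16_t old, uint16_t new)
	{
		uint32_t sum = (uint16_t)~check;

		sum += (uint16_t)~old + new;
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}

	int main(void)
	{
		uint8_t pkt[8] = { 0x12, 0x34, 0x56, 0x78, 0x00, 0x40, 0xab, 0xcd };
		uint16_t full, incr;

		full = csum(pkt, sizeof(pkt));            /* checksum with len = 0x0040 */
		incr = csum_update(full, 0x0040, 0x0100); /* len grows to 0x0100        */

		pkt[4] = 0x01; pkt[5] = 0x00;             /* actually change the field  */
		printf("recomputed %04x incremental %04x\n", csum(pkt, sizeof(pkt)), incr);
		return 0;
	}

Both paths print identical values, which is why patching only uh->check with the length delta, as the loop above does, is safe.
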
IP headers of the fragments are updated in - * inet_gso_segment() - */ - segs = skb_segment(skb, features); + skb->protocol = protocol; + } while ((skb = skb->next)); out: return segs; } - diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index a7f86a3cd50..7927db0a927 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -24,7 +24,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid, + return inet_sk_diag_fill(sk, NULL, skb, req, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } @@ -34,15 +36,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, int err = -EINVAL; struct sock *sk; struct sk_buff *rep; + struct net *net = sock_net(in_skb->sk); if (req->sdiag_family == AF_INET) - sk = __udp4_lib_lookup(&init_net, + sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, tbl); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) - sk = __udp6_lib_lookup(&init_net, + sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, @@ -61,21 +64,22 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, goto out; err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - 64)), GFP_KERNEL); + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + 64, + GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, req, - NETLINK_CB(in_skb).pid, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -90,6 +94,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin struct inet_diag_req_v2 *r, struct nlattr *bc) { int num, s_num, slot, s_slot; + struct net *net = sock_net(skb->sk); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -106,6 +111,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin sk_nulls_for_each(sk, node, &hslot->head) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 5a681e298b9..f3c27899f62 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -5,30 +5,30 @@ #include <net/protocol.h> #include <net/inet_common.h> -extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int ); -extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); +void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); -extern int udp_v4_get_port(struct sock *sk, unsigned short snum); +int udp_v4_get_port(struct sock *sk, unsigned short snum); -extern int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -extern int udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +int udp_setsockopt(struct sock *sk, int level, int 
optname, + char __user *optval, unsigned int optlen); +int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #ifdef CONFIG_COMPAT -extern int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -extern int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +int compat_udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +int compat_udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #endif -extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len); -extern int udp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags); -extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -extern void udp_destroy_sock(struct sock *sk); +int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, + int flags); +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS -extern int udp4_seq_show(struct seq_file *seq, void *v); +int udp4_seq_show(struct seq_file *seq, void *v); #endif #endif /* _UDP4_IMPL_H */ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c new file mode 100644 index 00000000000..546d2d439dd --- /dev/null +++ b/net/ipv4/udp_offload.c @@ -0,0 +1,250 @@ +/* + * IPV4 GSO/GRO offload support + * Linux INET implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * UDPv4 GSO support + */ + +#include <linux/skbuff.h> +#include <net/udp.h> +#include <net/protocol.h> + +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; + +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + +struct udp_offload_priv { + struct udp_offload *offload; + struct rcu_head rcu; + struct udp_offload_priv __rcu *next; +}; + +static int udp4_ufo_send_check(struct sk_buff *skb) +{ + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + return -EINVAL; + + if (likely(!skb->encapsulation)) { + const struct iphdr *iph; + struct udphdr *uh; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + + return 0; +} + +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + int offset; + __wsum csum; + + if (skb->encapsulation && + (skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { + segs = skb_udp_tunnel_segment(skb, features); + goto out; + } + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. 
*/ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_IPIP | + SKB_GSO_GRE | SKB_GSO_GRE_CSUM | + SKB_GSO_MPLS) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: + return segs; +} + +int udp_add_offload(struct udp_offload *uo) +{ + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); + + if (!new_offload) + return -ENOMEM; + + new_offload->offload = uo; + + spin_lock(&udp_offload_lock); + new_offload->next = udp_offload_base; + rcu_assign_pointer(udp_offload_base, new_offload); + spin_unlock(&udp_offload_lock); + + return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ + struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); + kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ + struct udp_offload_priv __rcu **head = &udp_offload_base; + struct udp_offload_priv *uo_priv; + + spin_lock(&udp_offload_lock); + + uo_priv = udp_deref_protected(*head); + for (; uo_priv != NULL; + uo_priv = udp_deref_protected(*head)) { + if (uo_priv->offload == uo) { + rcu_assign_pointer(*head, + udp_deref_protected(uo_priv->next)); + goto unlock; + } + head = &uo_priv->next; + } + pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); +unlock: + spin_unlock(&udp_offload_lock); + if (uo_priv != NULL) + call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct udp_offload_priv *uo_priv; + struct sk_buff *p, **pp = NULL; + struct udphdr *uh, *uh2; + unsigned int hlen, off; + int flush = 1; + + if (NAPI_GRO_CB(skb)->udp_mark || + (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + goto out; + + /* mark that this skb passed once through the udp gro layer */ + NAPI_GRO_CB(skb)->udp_mark = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + uh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!uh)) + goto out; + } + + rcu_read_lock(); + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_receive) + goto unflush; + } + goto out_unlock; + +unflush: + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = (struct udphdr *)(p->data + off); + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: + 
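
udp_add_offload()/udp_del_offload() above let a tunnel driver attach GRO callbacks to a UDP destination port, and udp_gro_receive() dispatches on uh->dest. The sketch below shows how a hypothetical encapsulation module might plug into this; the struct udp_offload shape (.port, .callbacks.gro_receive, .callbacks.gro_complete) is taken from the dispatch code above, while the foo_* names, the port number, the header choices and the empty handler bodies are invented placeholders, not a working driver.

	/* Hypothetical UDP tunnel module registering for GRO on its port.
	 * Everything prefixed foo_ is made up for illustration.
	 */
	#include <linux/module.h>
	#include <linux/netdevice.h>
	#include <net/protocol.h>
	#include <net/udp.h>

	static struct sk_buff **foo_gro_receive(struct sk_buff **head,
						struct sk_buff *skb)
	{
		/* Inspect the encapsulated frame and decide whether it can be
		 * merged with one already held on @head.  Returning NULL just
		 * lets the core flush as usual.
		 */
		return NULL;
	}

	static int foo_gro_complete(struct sk_buff *skb, int nhoff)
	{
		/* Runs once the merged packet is handed up; @nhoff points just
		 * past the outer UDP header (see udp_gro_complete() above).
		 */
		return 0;
	}

	static struct udp_offload foo_udp_offload = {
		.callbacks = {
			.gro_receive	= foo_gro_receive,
			.gro_complete	= foo_gro_complete,
		},
	};

	static int __init foo_offload_init(void)
	{
		foo_udp_offload.port = htons(5555);	/* invented tunnel port */
		return udp_add_offload(&foo_udp_offload);
	}

	static void __exit foo_offload_exit(void)
	{
		udp_del_offload(&foo_udp_offload);
	}

	module_init(foo_offload_init);
	module_exit(foo_offload_exit);
	MODULE_LICENSE("GPL");

Nothing in net/ipv4 itself registers an offload in this diff; the intended consumers are UDP-encapsulating tunnel drivers such as vxlan.
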
rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct udp_offload_priv *uo_priv; + __be16 newlen = htons(skb->len - nhoff); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + int err = -ENOSYS; + + uh->len = newlen; + + rcu_read_lock(); + + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_complete) + break; + } + + if (uo_priv != NULL) + err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + + rcu_read_unlock(); + return err; +} + +static const struct net_offload udpv4_offload = { + .callbacks = { + .gso_send_check = udp4_ufo_send_check, + .gso_segment = udp4_ufo_fragment, + .gro_receive = udp_gro_receive, + .gro_complete = udp_gro_complete, + }, +}; + +int __init udpv4_offload_init(void) +{ + return inet_add_offload(&udpv4_offload, IPPROTO_UDP); +} diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2c46acd4cc3..3b3efbda48e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216d..aac6197b7a7 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -37,15 +37,6 @@ drop: return NET_RX_DROP; } -int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - return xfrm_input(skb, nexthdr, spi, encap_type); -} -EXPORT_SYMBOL(xfrm4_rcv_encap); - int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct iphdr *iph = ip_hdr(skb); @@ -132,7 +123,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto drop; /* Now we can update and verify the packet length... 
*/ diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e3db3f91511..71acd0014f2 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); skb_set_network_header(skb, -x->props.header_len - - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); + hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); if (x->sel.family != AF_INET6) skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ed4bf11ef9f..91771a7c802 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -44,8 +44,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, + /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */ + if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) + top_iph->tos = 0; + else + top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos; + top_iph->tos = INET_ECN_encapsulate(top_iph->tos, XFRM_MODE_SKB_CB(skb)->tos); flags = x->props.flags; @@ -54,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - ip_select_ident(top_iph, dst->child, NULL); top_iph->ttl = ip4_dst_hoplimit(dst->child); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + ip_select_ident(skb, NULL); return 0; } @@ -74,8 +78,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 327a617d594..d5f6bd9a210 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -21,20 +21,17 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; - struct dst_entry *dst; if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) goto out; - dst = skb_dst(skb); - mtu = dst_mtu(dst); + mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu) { if (skb->sk) - ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, - inet_sk(skb->sk)->inet_dport, mtu); + xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -65,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - - skb->protocol = htons(ETH_P_IP); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; return x->outer_mode->output2(x, skb); } @@ -76,26 +70,42 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER - if (!skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + skb->protocol = htons(ETH_P_IP); +#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= 
IPSKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IP); return xfrm_output(skb); } -int xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER + if (!x) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + + return x->outer_mode->afinfo->output_finish(skb); +} +int xfrm4_output(struct sock *sk, struct sk_buff *skb) +{ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, dst->dev, - x->outer_mode->afinfo->output_finish, + NULL, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } + +void xfrm4_local_error(struct sk_buff *skb, u32 mtu) +{ + struct iphdr *hdr; + + hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); + ip_local_error(skb->sk, EMSGSIZE, hdr->daddr, + inet_sk(skb->sk)->inet_dport, mtu); +} diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0d3426cb5c4..6156f68a1e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -79,30 +79,21 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, struct rtable *rt = (struct rtable *)xdst->route; const struct flowi4 *fl4 = &fl->u.ip4; - xdst->u.rt.rt_key_dst = fl4->daddr; - xdst->u.rt.rt_key_src = fl4->saddr; - xdst->u.rt.rt_key_tos = fl4->flowi4_tos; - xdst->u.rt.rt_route_iif = fl4->flowi4_iif; xdst->u.rt.rt_iif = fl4->flowi4_iif; - xdst->u.rt.rt_oif = fl4->flowi4_oif; - xdst->u.rt.rt_mark = fl4->flowi4_mark; xdst->u.dst.dev = dev; dev_hold(dev); - xdst->u.rt.peer = rt->peer; - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ + xdst->u.rt.rt_is_input = rt->rt_is_input; xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_src = rt->rt_src; - xdst->u.rt.rt_dst = rt->rt_dst; xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; + xdst->u.rt.rt_pmtu = rt->rt_pmtu; + INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; } @@ -113,9 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; + fl4->flowi4_oif = reverse ? 
skb->skb_iif : oif; if (!ip_is_fragment(iph)) { switch (iph->protocol) { @@ -198,12 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } -static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, sk, skb, mtu); +} + +static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->redirect(path, sk, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) @@ -212,9 +218,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) dst_destroy_metrics_generic(dst); - if (likely(xdst->u.rt.peer)) - inet_putpeer(xdst->u.rt.peer); - xfrm_dst_destroy(xdst); } @@ -232,11 +235,12 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 1024, + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { @@ -263,43 +267,67 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static struct ctl_table_header *sysctl_hdr; -#endif - -static void __init xfrm4_policy_init(void) +static int __net_init xfrm4_net_init(struct net *net) { - xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = xfrm4_policy_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh; + } + + hdr = register_net_sysctl(net, "net/ipv4", table); + if (!hdr) + goto err_reg; + + net->ipv4.xfrm4_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; } -static void __exit xfrm4_policy_fini(void) +static void __net_exit xfrm4_net_exit(struct net *net) { -#ifdef CONFIG_SYSCTL - if (sysctl_hdr) - unregister_net_sysctl_table(sysctl_hdr); + struct ctl_table *table; + + if (net->ipv4.xfrm4_hdr == NULL) + return; + + table = net->ipv4.xfrm4_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.xfrm4_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations __net_initdata xfrm4_net_ops = { + .init = xfrm4_net_init, + .exit = xfrm4_net_exit, +}; #endif - xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); + +static void __init xfrm4_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); } -void __init xfrm4_init(int rt_max_size) +void __init xfrm4_init(void) { - /* - * Select a default value for the gc_thresh based on the main route - * table hash size. It seems to me the worst case scenario is when - * we have ipsec operating in transport mode, in which we create a - * dst_entry per socket. The xfrm gc algorithm starts trying to remove - * entries at gc_thresh, and prevents new allocations as 2*gc_thresh - * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. 
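
xfrm4_net_init() above makes the gc threshold per-namespace and registers xfrm4_policy_table under net/ipv4. The rows of that table are outside this hunk, so the procname used below, xfrm4_gc_thresh, is an assumption based on the name this table has historically exposed; adjust it if your tree differs. A trivial userspace reader:

	/* Read the per-namespace xfrm4 gc threshold.  The sysctl name is an
	 * assumption; the table entries are not part of the hunk above.
	 */
	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f = fopen("/proc/sys/net/ipv4/xfrm4_gc_thresh", "r");

		if (!f) {
			perror("xfrm4_gc_thresh");
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("xfrm4 gc_thresh: %s", buf);
		fclose(f);
		return 0;
	}

Under the same assumption, the value can be tuned per namespace with sysctl -w net.ipv4.xfrm4_gc_thresh=<value>.
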
- * That will let us store an ipsec connection per route table entry, - * and start cleaning when were 1/2 full - */ - xfrm4_dst_ops.gc_thresh = rt_max_size/2; dst_entries_init(&xfrm4_dst_ops); xfrm4_state_init(); xfrm4_policy_init(); + xfrm4_protocol_init(); #ifdef CONFIG_SYSCTL - sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4", - xfrm4_policy_table); + register_pernet_subsys(&xfrm4_net_ops); #endif } diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 00000000000..a2ce0101eaa --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c @@ -0,0 +1,301 @@ +/* xfrm4_protocol.c - Generic xfrm protocol multiplexer. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/tunnel4.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_protocol_mutex); + +static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_handlers; + case IPPROTO_AH: + return &ah4_handlers; + case IPPROTO_COMP: + return &ipcomp4_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_cb); + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + if (!head) + goto out; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +static int xfrm4_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(esp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(esp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ah_rcv(struct sk_buff *skb) +{ + int ret; + 
struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ah4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret;; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ah4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static const struct net_protocol esp4_protocol = { + .handler = xfrm4_esp_rcv, + .err_handler = xfrm4_esp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ah4_protocol = { + .handler = xfrm4_ah_rcv, + .err_handler = xfrm4_ah_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_ipcomp_rcv, + .err_handler = xfrm4_ipcomp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static struct xfrm_input_afinfo xfrm4_input_afinfo = { + .family = AF_INET, + .owner = THIS_MODULE, + .callback = xfrm4_rcv_cb, +}; + +static inline const struct net_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_protocol; + case IPPROTO_AH: + return &ah4_protocol; + case IPPROTO_COMP: + return &ipcomp4_protocol; + } + + return NULL; +} + +int xfrm4_protocol_register(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm4_protocol_mutex); + + if (add_netproto) { + if (inet_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_register); + +int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if 
+	    (!rcu_dereference_protected(*proto_handlers(protocol),
+					lockdep_is_held(&xfrm4_protocol_mutex))) {
+		if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+			pr_err("%s: can't remove protocol\n", __func__);
+			ret = -EAGAIN;
+		}
+	}
+
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+	xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9258e751bab..542074c00c7 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -16,7 +16,7 @@
 static int xfrm4_init_flags(struct xfrm_state *x)
 {
-	if (ipv4_config.no_pmtu_disc)
+	if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
 		x->props.flags |= XFRM_STATE_NOPMTUDISC;
 	return 0;
 }
 
@@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.extract_input		= xfrm4_extract_input,
 	.extract_output		= xfrm4_extract_output,
 	.transport_finish	= xfrm4_transport_finish,
+	.local_error		= xfrm4_local_error,
 };
 
 void __init xfrm4_state_init(void)
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 05a5df2febc..06347dbd32c 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -63,7 +63,7 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
 static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
 	.handler	=	xfrm_tunnel_rcv,
 	.err_handler	=	xfrm_tunnel_err,
-	.priority	=	2,
+	.priority	=	3,
 };
 
 #if IS_ENABLED(CONFIG_IPV6)
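
The new xfrm4_protocol.c multiplexer lets several transforms stack on the same IP protocol number, tried in priority order until one returns something other than -EINVAL. Below is a hedged sketch of a module hooking into it; the field names and the register/deregister calls mirror the code above (struct xfrm4_protocol, xfrm4_protocol_register()), while the foo_* handlers are empty placeholders rather than real ESP processing. Elsewhere in this series the in-tree esp4, ah4 and ipcomp code is what actually stacks on these lists.

	/* Hypothetical handler stacking on the xfrm4_protocol multiplexer above.
	 * All foo_* names are invented; the bodies do no real work.
	 */
	#include <linux/module.h>
	#include <linux/skbuff.h>
	#include <net/xfrm.h>

	static int foo_esp_rcv(struct sk_buff *skb)
	{
		/* -EINVAL means "not mine": the next handler on the list runs */
		return -EINVAL;
	}

	static int foo_esp_input(struct sk_buff *skb, int nexthdr, __be32 spi,
				 int encap_type)
	{
		return -EINVAL;
	}

	static int foo_esp_cb(struct sk_buff *skb, int err)
	{
		return 0;
	}

	static int foo_esp_err(struct sk_buff *skb, u32 info)
	{
		/* returning 0 tells xfrm4_esp_err() the ICMP error was consumed */
		return 0;
	}

	static struct xfrm4_protocol foo_esp4_protocol = {
		.handler	= foo_esp_rcv,
		.input_handler	= foo_esp_input,
		.cb_handler	= foo_esp_cb,
		.err_handler	= foo_esp_err,
		.priority	= 10,	/* higher-priority handlers are tried first */
	};

	static int __init foo_proto_init(void)
	{
		return xfrm4_protocol_register(&foo_esp4_protocol, IPPROTO_ESP);
	}

	static void __exit foo_proto_exit(void)
	{
		xfrm4_protocol_deregister(&foo_esp4_protocol, IPPROTO_ESP);
	}

	module_init(foo_proto_init);
	module_exit(foo_proto_exit);
	MODULE_LICENSE("GPL");
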
