diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/devinet.c | 17 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 25 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 80 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 4 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 8 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 67 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 528 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 3 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 15 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_MASQUERADE.c | 2 | ||||
-rw-r--r-- | net/ipv4/ping.c | 2 | ||||
-rw-r--r-- | net/ipv4/proc.c | 7 | ||||
-rw-r--r-- | net/ipv4/raw.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 24 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 15 | ||||
-rw-r--r-- | net/ipv4/tcp_fastopen.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 156 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 27 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_probe.c | 67 | ||||
-rw-r--r-- | net/ipv4/udp.c | 5 |
23 files changed, 418 insertions, 667 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 34ca6d5a3a4..a1b5bcbd04a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = { [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, + [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, }, }; @@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, + [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, }, }; @@ -1126,10 +1130,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) if (len < (int) sizeof(ifr)) break; memset(&ifr, 0, sizeof(struct ifreq)); - if (ifa->ifa_label) - strcpy(ifr.ifr_name, ifa->ifa_label); - else - strcpy(ifr.ifr_name, dev->name); + strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = @@ -2097,11 +2098,15 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), + DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, + "force_igmp_version"), + DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, + "igmpv2_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, + "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), - DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, - "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 26aa65d1fce..523be38e37d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -101,6 +101,30 @@ errout: return err; } +static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct fib_result *result = (struct fib_result *) arg->result; + struct net_device *dev = result->fi->fib_dev; + + /* do not accept result if the route does + * not meet the required prefix length + */ + if (result->prefixlen <= rule->suppress_prefixlen) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: + if (!(arg->flags & FIB_LOOKUP_NOREF)) + fib_info_put(result->fi); + return true; +} static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -267,6 +291,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, + .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, .delete = fib4_rule_delete, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cd71190d296..d6c0e64ec97 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -88,6 +88,7 @@ #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> +#include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/arp.h> @@ -113,7 +114,8 @@ #define IGMP_V1_Router_Present_Timeout (400*HZ) #define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) +#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) #define IGMP_Query_Response_Interval (10*HZ) #define IGMP_Unsolicited_Report_Count 2 @@ -138,6 +140,29 @@ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) +static int unsolicited_report_interval(struct in_device *in_dev) +{ + int interval_ms, interval_jiffies; + + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) + interval_ms = IN_DEV_CONF_GET( + in_dev, + IGMPV2_UNSOLICITED_REPORT_INTERVAL); + else /* v3 */ + interval_ms = IN_DEV_CONF_GET( + in_dev, + IGMPV3_UNSOLICITED_REPORT_INTERVAL); + + interval_jiffies = msecs_to_jiffies(interval_ms); + + /* _timer functions can't handle a delay of 0 jiffies so ensure + * we always return a positive value. + */ + if (interval_jiffies <= 0) + interval_jiffies = 1; + return interval_jiffies; +} + static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); static void igmpv3_clear_delrec(struct in_device *in_dev); @@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) if (size < 256) return NULL; } + skb->priority = TC_PRIO_CONTROL; igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, @@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ip_rt_put(rt); return -1; } + skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); @@ -719,7 +746,8 @@ static void igmp_ifc_timer_expire(unsigned long data) igmpv3_send_cr(in_dev); if (in_dev->mr_ifc_count) { in_dev->mr_ifc_count--; - igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); + igmp_ifc_start_timer(in_dev, + unsolicited_report_interval(in_dev)); } __in_dev_put(in_dev); } @@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data) if (im->unsolicit_count) { im->unsolicit_count--; - igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + igmp_start_timer(im, unsolicited_report_interval(in_dev)); } im->reporter = 1; spin_unlock(&im->lock); @@ -1323,16 +1351,17 @@ out: EXPORT_SYMBOL(ip_mc_inc_group); /* - * Resend IGMP JOIN report; used for bonding. - * Called with rcu_read_lock() + * Resend IGMP JOIN report; used by netdev notifier. */ -void ip_mc_rejoin_groups(struct in_device *in_dev) +static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; - for_each_pmc_rcu(in_dev, im) { + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; @@ -1349,7 +1378,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev) } #endif } -EXPORT_SYMBOL(ip_mc_rejoin_groups); /* * A socket has left a multicast group on device dev @@ -2735,8 +2763,42 @@ static struct pernet_operations igmp_net_ops = { .exit = igmp_net_exit, }; +static int igmp_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct in_device *in_dev; + + switch (event) { + case NETDEV_RESEND_IGMP: + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) + ip_mc_rejoin_groups(in_dev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block igmp_notifier = { + .notifier_call = igmp_netdev_event, +}; + int __init igmp_mc_proc_init(void) { - return register_pernet_subsys(&igmp_net_ops); + int err; + + err = register_pernet_subsys(&igmp_net_ops); + if (err) + return err; + err = register_netdevice_notifier(&igmp_notifier); + if (err) + goto reg_notif_fail; + return 0; + +reg_notif_fail: + unregister_pernet_subsys(&igmp_net_ops); + return err; } #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8d6939eeb49..d7aea4c5b94 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net) static void __net_exit ipgre_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { @@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net) static void __net_exit ipgre_tap_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 15e3e683ade..054a3e97d82 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -141,6 +141,7 @@ #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> +#include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> @@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; + BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); + BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); + BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); + IP_ADD_STATS_BH(dev_net(dev), + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ca1cb2d5f6e..830de3f4e29 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_tunnel(dev_net(dev), &fl4, + rt = ip_route_output_tunnel(tunnel->net, &fl4, tunnel->parms.iph.protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, @@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; @@ -454,15 +454,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); - if (tunnel->net != dev_net(tunnel->dev)) - skb_scrub_packet(skb); - if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; } + + if (!net_eq(tunnel->net, dev_net(tunnel->dev))) + skb_scrub_packet(skb); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -613,7 +614,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tunnel->net != dev_net(dev)) + if (!net_eq(tunnel->net, dev_net(dev))) skb_scrub_packet(skb); if (tunnel->err_count > 0) { @@ -653,7 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - err = iptunnel_xmit(dev_net(dev), rt, skb, + err = iptunnel_xmit(tunnel->net, rt, skb, fl4.saddr, fl4.daddr, protocol, ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -820,11 +821,10 @@ static void ip_tunnel_dev_free(struct net_device *dev) void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) { - struct net *net = dev_net(dev); struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_net *itn; - itn = net_generic(net, tunnel->ip_tnl_net_id); + itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { ip_tunnel_del(netdev_priv(dev)); @@ -838,56 +838,68 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); struct ip_tunnel_parm parms; + unsigned int i; - itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL); - if (!itn->tunnels) - return -ENOMEM; + for (i = 0; i < IP_TNL_HASH_SIZE; i++) + INIT_HLIST_HEAD(&itn->tunnels[i]); if (!ops) { itn->fb_tunnel_dev = NULL; return 0; } + memset(&parms, 0, sizeof(parms)); if (devname) strlcpy(parms.name, devname, IFNAMSIZ); rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + if (!IS_ERR(itn->fb_tunnel_dev)) + itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; rtnl_unlock(); - if (IS_ERR(itn->fb_tunnel_dev)) { - kfree(itn->tunnels); - return PTR_ERR(itn->fb_tunnel_dev); - } - return 0; + return PTR_RET(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head) +static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, + struct rtnl_link_ops *ops) { + struct net *net = dev_net(itn->fb_tunnel_dev); + struct net_device *dev, *aux; int h; + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == ops) + unregister_netdevice_queue(dev, head); + for (h = 0; h < IP_TNL_HASH_SIZE; h++) { struct ip_tunnel *t; struct hlist_node *n; struct hlist_head *thead = &itn->tunnels[h]; hlist_for_each_entry_safe(t, n, thead, hash_node) - unregister_netdevice_queue(t->dev, head); + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, head); } if (itn->fb_tunnel_dev) unregister_netdevice_queue(itn->fb_tunnel_dev, head); } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn) +void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) { LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list); + ip_tunnel_destroy(itn, &list, ops); unregister_netdevice_many(&list); rtnl_unlock(); - kfree(itn->tunnels); } EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); @@ -929,23 +941,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p) { - struct ip_tunnel *t, *nt; - struct net *net = dev_net(dev); + struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) return -EINVAL; - nt = netdev_priv(dev); - t = ip_tunnel_find(itn, p, dev->type); if (t) { if (t->dev != dev) return -EEXIST; } else { - t = nt; + t = tunnel; if (dev->type != ARPHRD_ETHER) { unsigned int nflags = 0; @@ -984,6 +994,7 @@ int ip_tunnel_init(struct net_device *dev) } tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; @@ -994,8 +1005,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { - struct net *net = dev_net(dev); struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 17cc0ffa8c0..e805e7b3030 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -44,176 +44,10 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> -#define HASH_SIZE 16 -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1)) - static struct rtnl_link_ops vti_link_ops __read_mostly; static int vti_net_id __read_mostly; -struct vti_net { - struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_wc[1]; - struct ip_tunnel __rcu **tunnels[4]; - - struct net_device *fb_tunnel_dev; -}; - -static int vti_fb_tunnel_init(struct net_device *dev); static int vti_tunnel_init(struct net_device *dev); -static void vti_tunnel_setup(struct net_device *dev); -static void vti_dev_free(struct net_device *dev); -static int vti_tunnel_bind_dev(struct net_device *dev); - -#define VTI_XMIT(stats1, stats2) do { \ - int err; \ - int pkt_len = skb->len; \ - err = dst_output(skb); \ - if (net_xmit_eval(err) == 0) { \ - u64_stats_update_begin(&(stats1)->syncp); \ - (stats1)->tx_bytes += pkt_len; \ - (stats1)->tx_packets++; \ - u64_stats_update_end(&(stats1)->syncp); \ - } else { \ - (stats2)->tx_errors++; \ - (stats2)->tx_aborted_errors++; \ - } \ -} while (0) - - -static struct ip_tunnel *vti_tunnel_lookup(struct net *net, - __be32 remote, __be32 local) -{ - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); - struct ip_tunnel *t; - struct vti_net *ipn = net_generic(net, vti_net_id); - - for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) - if (local == t->parms.iph.saddr && - remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) - return t; - for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) - if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) - return t; - - for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) - if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) - return t; - - for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0]) - if (t && (t->dev->flags&IFF_UP)) - return t; - return NULL; -} - -static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn, - struct ip_tunnel_parm *parms) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - unsigned h = 0; - int prio = 0; - - if (remote) { - prio |= 2; - h ^= HASH(remote); - } - if (local) { - prio |= 1; - h ^= HASH(local); - } - return &ipn->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn, - struct ip_tunnel *t) -{ - return __vti_bucket(ipn, &t->parms); -} - -static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp; - struct ip_tunnel *iter; - - for (tp = vti_bucket(ipn, t); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; - } - } -} - -static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp = vti_bucket(ipn, t); - - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} - -static struct ip_tunnel *vti_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, - int create) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - struct ip_tunnel *t, *nt; - struct ip_tunnel __rcu **tp; - struct net_device *dev; - char name[IFNAMSIZ]; - struct vti_net *ipn = net_generic(net, vti_net_id); - - for (tp = __vti_bucket(ipn, parms); - (t = rtnl_dereference(*tp)) != NULL; - tp = &t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) - return t; - } - if (!create) - return NULL; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else - strcpy(name, "vti%d"); - - dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup); - if (dev == NULL) - return NULL; - - dev_net_set(dev, net); - - nt = netdev_priv(dev); - nt->parms = *parms; - dev->rtnl_link_ops = &vti_link_ops; - - vti_tunnel_bind_dev(dev); - - if (register_netdevice(dev) < 0) - goto failed_free; - - dev_hold(dev); - vti_tunnel_link(ipn, nt); - return nt; - -failed_free: - free_netdev(dev); - return NULL; -} - -static void vti_tunnel_uninit(struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct vti_net *ipn = net_generic(net, vti_net_id); - - vti_tunnel_unlink(ipn, netdev_priv(dev)); - dev_put(dev); -} static int vti_err(struct sk_buff *skb, u32 info) { @@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info) * 8 bytes of packet payload. It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. */ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); struct iphdr *iph = (struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; @@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info) err = -ENOENT; - t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); if (t == NULL) goto out; @@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); if (tunnel != NULL) { struct pcpu_tstats *tstats; @@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb) static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos; struct rtable *rt; /* Route to the other host */ @@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct iphdr *old_iph = ip_hdr(skb); __be32 dst = tiph->daddr; struct flowi4 fl4; + int err; if (skb->protocol != htons(ETH_P_IP)) goto tx_error; @@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); skb->dev = skb_dst(skb)->dev; - tstats = this_cpu_ptr(dev->tstats); - VTI_XMIT(tstats, &dev->stats); + err = dst_output(skb); + if (net_xmit_eval(err) == 0) + err = skb->len; + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; tx_error_icmp: @@ -379,198 +221,57 @@ tx_error: return NETDEV_TX_OK; } -static int vti_tunnel_bind_dev(struct net_device *dev) -{ - struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; - struct iphdr *iph; - - tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; - - if (iph->daddr) { - struct rtable *rt; - struct flowi4 fl4; - memset(&fl4, 0, sizeof(fl4)); - flowi4_init_output(&fl4, tunnel->parms.link, - be32_to_cpu(tunnel->parms.i_key), - RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, - IPPROTO_IPIP, 0, - iph->daddr, iph->saddr, 0, 0); - rt = ip_route_output_key(dev_net(dev), &fl4); - if (!IS_ERR(rt)) { - tdev = rt->dst.dev; - ip_rt_put(rt); - } - dev->flags |= IFF_POINTOPOINT; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); - - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + - sizeof(struct iphdr); - dev->mtu = tdev->mtu; - } - dev->iflink = tunnel->parms.link; - return dev->mtu; -} - static int vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct vti_net *ipn = net_generic(net, vti_net_id); - - switch (cmd) { - case SIOCGETTUNNEL: - t = NULL; - if (dev == ipn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, - sizeof(p))) { - err = -EFAULT; - break; - } - t = vti_tunnel_locate(net, &p, 0); - } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof(p)); - p.i_flags |= GRE_KEY | VTI_ISVTI; - p.o_flags |= GRE_KEY; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - break; - - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; - err = -EINVAL; + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || p.iph.ihl != 5) - goto done; - - t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && - !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && - p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - vti_tunnel_unlink(ipn, t); - synchronize_net(); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - t->parms.i_key = p.i_key; - t->parms.o_key = p.o_key; - t->parms.iph.protocol = IPPROTO_IPIP; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - vti_tunnel_link(ipn, t); - netdev_state_change(dev); - } - } - - if (t) { - err = 0; - if (cmd == SIOCCHGTUNNEL) { - t->parms.i_key = p.i_key; - t->parms.o_key = p.o_key; - if (t->parms.link != p.link) { - t->parms.link = p.link; - vti_tunnel_bind_dev(dev); - netdev_state_change(dev); - } - } - p.i_flags |= GRE_KEY | VTI_ISVTI; - p.o_flags |= GRE_KEY; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, - sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; + return -EINVAL; + } - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == ipn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, - sizeof(p))) - goto done; - err = -ENOENT; - - t = vti_tunnel_locate(net, &p, 0); - if (t == NULL) - goto done; - err = -EPERM; - if (t->dev == ipn->fb_tunnel_dev) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; + err = ip_tunnel_ioctl(dev, &p, cmd); + if (err) + return err; - default: - err = -EINVAL; + if (cmd != SIOCDELTUNNEL) { + p.i_flags |= GRE_KEY | VTI_ISVTI; + p.o_flags |= GRE_KEY; } -done: - return err; -} - -static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < 68 || new_mtu > 0xFFF8) - return -EINVAL; - dev->mtu = new_mtu; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; return 0; } static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, - .ndo_uninit = vti_tunnel_uninit, + .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, .ndo_do_ioctl = vti_tunnel_ioctl, - .ndo_change_mtu = vti_tunnel_change_mtu, + .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, }; -static void vti_dev_free(struct net_device *dev) +static void vti_tunnel_setup(struct net_device *dev) { - free_percpu(dev->tstats); - free_netdev(dev); + dev->netdev_ops = &vti_netdev_ops; + ip_tunnel_setup(dev, vti_net_id); } -static void vti_tunnel_setup(struct net_device *dev) +static int vti_tunnel_init(struct net_device *dev) { - dev->netdev_ops = &vti_netdev_ops; - dev->destructor = vti_dev_free; + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + + memcpy(dev->dev_addr, &iph->saddr, 4); + memcpy(dev->broadcast, &iph->daddr, 4); dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; -} -static int vti_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); - - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); - memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; + return ip_tunnel_init(dev); } -static int __net_init vti_fb_tunnel_init(struct net_device *dev) +static void __net_init vti_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); iph->version = 4; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; - - dev_hold(dev); - rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); - return 0; } static struct xfrm_tunnel vti_handler __read_mostly = { @@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = { .priority = 1, }; -static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head) -{ - int prio; - - for (prio = 1; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - - t = rtnl_dereference(ipn->tunnels[prio][h]); - while (t != NULL) { - unregister_netdevice_queue(t->dev, head); - t = rtnl_dereference(t->next); - } - } - } -} - static int __net_init vti_init_net(struct net *net) { int err; - struct vti_net *ipn = net_generic(net, vti_net_id); - - ipn->tunnels[0] = ipn->tunnels_wc; - ipn->tunnels[1] = ipn->tunnels_l; - ipn->tunnels[2] = ipn->tunnels_r; - ipn->tunnels[3] = ipn->tunnels_r_l; - - ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), - "ip_vti0", - vti_tunnel_setup); - if (!ipn->fb_tunnel_dev) { - err = -ENOMEM; - goto err_alloc_dev; - } - dev_net_set(ipn->fb_tunnel_dev, net); - - err = vti_fb_tunnel_init(ipn->fb_tunnel_dev); - if (err) - goto err_reg_dev; - ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops; + struct ip_tunnel_net *itn; - err = register_netdev(ipn->fb_tunnel_dev); + err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); if (err) - goto err_reg_dev; + return err; + itn = net_generic(net, vti_net_id); + vti_fb_tunnel_init(itn->fb_tunnel_dev); return 0; - -err_reg_dev: - vti_dev_free(ipn->fb_tunnel_dev); -err_alloc_dev: - /* nothing */ - return err; } static void __net_exit vti_exit_net(struct net *net) { - struct vti_net *ipn = net_generic(net, vti_net_id); - LIST_HEAD(list); - - rtnl_lock(); - vti_destroy_tunnels(ipn, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + ip_tunnel_delete_net(itn, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, .exit = vti_exit_net, .id = &vti_net_id, - .size = sizeof(struct vti_net), + .size = sizeof(struct ip_tunnel_net), }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[], static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *nt; - struct net *net = dev_net(dev); - struct vti_net *ipn = net_generic(net, vti_net_id); - int mtu; - int err; - - nt = netdev_priv(dev); - vti_netlink_parms(data, &nt->parms); - - if (vti_tunnel_locate(net, &nt->parms, 0)) - return -EEXIST; + struct ip_tunnel_parm parms; - mtu = vti_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - - err = register_netdevice(dev); - if (err) - goto out; - - dev_hold(dev); - vti_tunnel_link(ipn, nt); - -out: - return err; + vti_netlink_parms(data, &parms); + return ip_tunnel_newlink(dev, tb, &parms); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *t, *nt; - struct net *net = dev_net(dev); - struct vti_net *ipn = net_generic(net, vti_net_id); struct ip_tunnel_parm p; - int mtu; - - if (dev == ipn->fb_tunnel_dev) - return -EINVAL; - nt = netdev_priv(dev); vti_netlink_parms(data, &p); - - t = vti_tunnel_locate(net, &p, 0); - - if (t) { - if (t->dev != dev) - return -EEXIST; - } else { - t = nt; - - vti_tunnel_unlink(ipn, t); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - t->parms.i_key = p.i_key; - t->parms.o_key = p.o_key; - if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - } - vti_tunnel_link(ipn, t); - netdev_state_change(dev); - } - - if (t->parms.link != p.link) { - t->parms.link = p.link; - mtu = vti_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - netdev_state_change(dev); - } - - return 0; + return ip_tunnel_changelink(dev, tb, &p); } static size_t vti_get_size(const struct net_device *dev) @@ -865,7 +441,7 @@ static int __init vti_init(void) err = xfrm4_mode_tunnel_input_register(&vti_handler); if (err < 0) { unregister_pernet_device(&vti_net_ops); - pr_info(KERN_INFO "vti init: can't register tunnel\n"); + pr_info("vti init: can't register tunnel\n"); } err = rtnl_link_register(&vti_link_ops); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 51fc2a1dcdd..87bd2952c73 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -286,7 +286,6 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; @@ -437,7 +436,7 @@ static int __net_init ipip_init_net(struct net *net) static void __net_exit ipip_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 132a0966470..bacc0bcf48c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); -static int ip_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *cache, - int local); +static void ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, @@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ -static int ip_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *cache, - int local) +static void ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local) { int psend = -1; int vif, ct; @@ -1903,14 +1903,13 @@ last_forward: ipmr_queue_xmit(net, mrt, skb2, cache, psend); } else { ipmr_queue_xmit(net, mrt, skb, cache, psend); - return 0; + return; } } dont_forward: if (!local) kfree_skb(skb); - return 0; } static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 30e4de94056..00352ce0f0d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this, NF_CT_ASSERT(dev->ifindex != 0); nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); } return NOTIFY_DONE; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 746427c9e71..d7d9882d4ca 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 463bd127334..4a0335854b8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -/* Following RFC4293 items are displayed in /proc/net/netstat */ +/* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), @@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), + SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dd44e0ab600..41d84505a92 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -987,7 +987,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a9a54a23683..727f4365bcd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,7 +112,8 @@ #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) -#define IP_MAX_MTU 0xFFF0 +/* IPv4 datagram length is stored into 16bit field (tot_len) */ +#define IP_MAX_MTU 0xFFFF #define RT_GC_TIMEOUT (300*HZ) @@ -435,12 +436,12 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); + return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); } void rt_cache_flush(struct net *net) { - rt_genid_bump(net); + rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } - if (mtu > IP_MAX_MTU) - mtu = IP_MAX_MTU; - - return mtu; + return min_t(unsigned int, mtu, IP_MAX_MTU); } static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) @@ -1458,7 +1456,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->dst.output = ip_rt_bug; - rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; rth->rt_is_input= 1; @@ -1589,7 +1587,7 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); + rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; rth->rt_is_input = 1; @@ -1760,7 +1758,7 @@ local_input: rth->dst.tclassid = itag; #endif - rth->rt_genid = rt_genid(net); + rth->rt_genid = rt_genid_ipv4(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; rth->rt_is_input = 1; @@ -1945,7 +1943,7 @@ add: rth->dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; rth->rt_is_input = 0; @@ -2227,7 +2225,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; - rt->rt_genid = rt_genid(net); + rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; @@ -2665,7 +2663,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->rt_genid, 0); + atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 610e324348d..8ed7c32ae28 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -559,6 +559,13 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, }, { + .procname = "tcp_notsent_lowat", + .data = &sysctl_tcp_notsent_lowat, + .maxlen = sizeof(sysctl_tcp_notsent_lowat), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b2f6c74861a..4e42c03859f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -410,10 +410,6 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -499,7 +495,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ set_bit(SOCK_ASYNC_NOSPACE, @@ -510,7 +506,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * wspace test but before the flags are set, * IO signal will be lost. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } } else @@ -2638,6 +2634,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: err = -ENOPROTOOPT; break; @@ -2854,6 +2854,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8f7ef0ad80e..ab7bd35bb31 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -58,23 +58,22 @@ error: kfree(ctx); return err; } -/* Computes the fastopen cookie for the peer. - * The peer address is a 128 bits long (pad with zeros for IPv4). +/* Computes the fastopen cookie for the IP path. + * The path is a 128 bits long (pad with zeros for IPv4). * * The caller must check foc->len to determine if a valid cookie * has been generated successfully. */ -void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) +void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, + struct tcp_fastopen_cookie *foc) { - __be32 peer_addr[4] = { addr, 0, 0, 0 }; + __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, - foc->val, - (__u8 *)peer_addr); + crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; } rcu_read_unlock(); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28af45abe06..ec492eae0cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1048,6 +1048,7 @@ struct tcp_sacktag_state { int reord; int fack_count; int flag; + s32 rtt; /* RTT measured by SACKing never-retransmitted data */ }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1108,7 +1109,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - bool dup_sack, int pcount) + int dup_sack, int pcount, u32 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1148,6 +1149,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; + /* Pick the earliest sequence sacked for RTT */ + if (state->rtt < 0) + state->rtt = tcp_time_stamp - xmit_time; } if (sacked & TCPCB_LOST) { @@ -1205,7 +1209,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount); + start_seq, end_seq, dup_sack, pcount, + TCP_SKB_CB(skb)->when); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1479,7 +1484,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, dup_sack, - tcp_skb_pcount(skb)); + tcp_skb_pcount(skb), + TCP_SKB_CB(skb)->when); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1536,7 +1542,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una) + u32 prior_snd_una, s32 *sack_rtt) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1554,6 +1560,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; + state.rtt = -1; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1737,6 +1744,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif + *sack_rtt = state.rtt; return state.flag; } @@ -1869,8 +1877,13 @@ void tcp_enter_loss(struct sock *sk, int how) } tcp_verify_left_out(tp); - tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + /* Timeout in disordered state after receiving substantial DUPACKs + * suggests that the degree of reordering is over-estimated. + */ + if (icsk->icsk_ca_state <= TCP_CA_Disorder && + tp->sacked_out >= sysctl_tcp_reordering) + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); @@ -2472,8 +2485,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) - tcp_moderate_cwnd(tp); } else { tcp_cwnd_reduction(sk, prior_unsacked, 0); } @@ -2792,65 +2803,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_xmit_retransmit_queue(sk); } -void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, + s32 seq_rtt, s32 sack_rtt) { - tcp_rtt_estimator(sk, seq_rtt); - tcp_set_rto(sk); - inet_csk(sk)->icsk_backoff = 0; -} -EXPORT_SYMBOL(tcp_valid_rtt_meas); + const struct tcp_sock *tp = tcp_sk(sk); + + /* Prefer RTT measured from ACK's timing to TS-ECR. This is because + * broken middle-boxes or peers may corrupt TS-ECR fields. But + * Karn's algorithm forbids taking RTT if some retransmitted data + * is acked (RFC6298). + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + seq_rtt = -1; + + if (seq_rtt < 0) + seq_rtt = sack_rtt; -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Supersedes RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, int flag) -{ /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. - * * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> - * - * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overestimated rto. With timestamps - * samples are accepted even from very old segments: f.e., when rtt=1 - * increases to 8, we retransmit 5 times and after 8 seconds delayed - * answer arrives rto becomes 120 seconds! If at least one of segments - * in window is lost... Voila. --ANK (010210) */ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); -} + if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) -{ - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ + if (seq_rtt < 0) + return false; - if (flag & FLAG_RETRANS_DATA_ACKED) - return; + tcp_rtt_estimator(sk, seq_rtt); + tcp_set_rto(sk); - tcp_valid_rtt_meas(sk, seq_rtt); + /* RFC6298: only reset backoff on valid RTT measurement. */ + inet_csk(sk)->icsk_backoff = 0; + return true; } -static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt) +/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ +static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { - const struct tcp_sock *tp = tcp_sk(sk); - /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, flag); - else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, flag); + struct tcp_sock *tp = tcp_sk(sk); + s32 seq_rtt = -1; + + if (tp->lsndtime && !tp->total_retrans) + seq_rtt = tcp_time_stamp - tp->lsndtime; + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) @@ -2939,7 +2936,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una) + u32 prior_snd_una, s32 sack_rtt) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2978,8 +2975,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; - ca_seq_rtt = -1; - seq_rtt = -1; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; @@ -3031,6 +3026,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; + if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || + (flag & FLAG_ACKED)) + tcp_rearm_rto(sk); + if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -3040,9 +3039,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_mtup_probe_success(sk); } - tcp_ack_update_rtt(sk, flag, seq_rtt); - tcp_rearm_rto(sk); - if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); } else { @@ -3130,11 +3126,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } +/* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - const struct tcp_sock *tp = tcp_sk(sk); - return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !tcp_in_cwnd_reduction(sk); + if (tcp_in_cwnd_reduction(sk)) + return false; + + /* If reordering is high then always grow cwnd whenever data is + * delivered regardless of its ordering. Otherwise stay conservative + * and only grow cwnd on in-order delivery in Open state, and retain + * cwnd in Disordered state (RFC5681). A stretched ACK with + * new SACK or ECE mark may first advance cwnd here and later reduce + * cwnd in tcp_fastretrans_alert() based on more states. + */ + if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + return flag & FLAG_FORWARD_PROGRESS; + + return inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + flag & FLAG_DATA_ACKED; } /* Check that window update is acceptable. @@ -3274,6 +3283,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ + s32 sack_rtt = -1; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3330,7 +3340,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_rtt); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; @@ -3349,21 +3360,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); acked -= tp->packets_out; + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, prior_in_flight); + if (tcp_ack_is_dubious(sk, flag)) { - /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); - } else { - if (flag & FLAG_DATA_ACKED) - tcp_cong_avoid(sk, ack, prior_in_flight); } - if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3402,7 +3410,8 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_rtt); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -5624,9 +5633,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * so release it. */ if (req) { - tcp_synack_rtt_meas(sk, req); tp->total_retrans = req->num_retrans; - reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ @@ -5651,6 +5658,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_synack_rtt_meas(sk, req); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b299da5ff49..09d45d71897 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - u16 queue_mapping, - bool nocache) + u16 queue_mapping) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, 0, false); + int res = tcp_v4_send_synack(sk, NULL, req, 0); if (!res) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); @@ -890,7 +889,7 @@ bool tcp_syn_flood_action(struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned) { + if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { lopt->synflood_warned = 1; pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -1316,9 +1315,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; return true; } + if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || memcmp(&foc->val[0], &valid_foc->val[0], TCP_FASTOPEN_COOKIE_SIZE) != 0) @@ -1329,14 +1330,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; return true; } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); } else { /* Client sent a cookie with wrong size. Treat it * the same as invalid and return a valid one. */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); } return false; } @@ -1462,7 +1465,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * limitations, they conserve resources and peer is * evidently real one. */ - if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); if (!want_cookie) goto drop; @@ -1671,8 +1675,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - tcp_synack_rtt_meas(newsk, req); - newtp->total_retrans = req->num_retrans; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ @@ -2605,7 +2607,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, long delta = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n", i, ireq->loc_addr, ntohs(inet_sk(sk)->inet_sport), @@ -2663,7 +2665,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", + "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, @@ -2802,6 +2804,7 @@ struct proto tcp_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ab1c0865852..58a3e69aef6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; + newtp->lsndtime = treq->snt_synack; + newtp->total_retrans = req->num_retrans; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ - if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) - tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; - else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ - tcp_rsk(req)->snt_synack = 0; - /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa8..884efff5b53 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index d4943f67aff..301a3effe57 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -54,12 +54,16 @@ static const char procname[] = "tcpprobe"; struct tcp_log { ktime_t tstamp; - __be32 saddr, daddr; - __be16 sport, dport; + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } src, dst; u16 length; u32 snd_nxt; u32 snd_una; u32 snd_wnd; + u32 rcv_wnd; u32 snd_cwnd; u32 ssthresh; u32 srtt; @@ -86,12 +90,36 @@ static inline int tcp_probe_avail(void) return bufsize - tcp_probe_used() - 1; } +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + +#if IS_ENABLED(CONFIG_IPV6) +#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \ + do { \ + struct ipv6_pinfo *pi6 = inet->pinet6; \ + si6.sin6_family = AF_INET6; \ + si6.sin6_port = inet->inet_##mem##port; \ + si6.sin6_addr = pi6->mem##addr; \ + si6.sin6_flowinfo = 0; /* No need here. */ \ + si6.sin6_scope_id = 0; /* No need here. */ \ + } while (0) +#else +#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \ + do { \ + memset(&si6, 0, sizeof(si6)); \ + } while (0) +#endif + /* * Hook inserted to be called before each receive packet. * Note: arguments must match tcp_rcv_established()! */ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned int len) + const struct tcphdr *th, unsigned int len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -107,15 +135,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_log *p = tcp_probe.log + tcp_probe.head; p->tstamp = ktime_get(); - p->saddr = inet->inet_saddr; - p->sport = inet->inet_sport; - p->daddr = inet->inet_daddr; - p->dport = inet->inet_dport; + switch (sk->sk_family) { + case AF_INET: + tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); + tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); + break; + case AF_INET6: + tcp_probe_copy_fl_to_si6(inet, p->src.v6, s); + tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d); + break; + default: + BUG(); + } + p->length = skb->len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; p->snd_wnd = tp->snd_wnd; + p->rcv_wnd = tp->rcv_wnd; p->ssthresh = tcp_current_ssthresh(sk); p->srtt = tp->srtt >> 3; @@ -157,13 +195,11 @@ static int tcpprobe_sprint(char *tbuf, int n) = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); return scnprintf(tbuf, n, - "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", + "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec, - &p->saddr, ntohs(p->sport), - &p->daddr, ntohs(p->dport), - p->length, p->snd_nxt, p->snd_una, - p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); + &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, + p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } static ssize_t tcpprobe_read(struct file *file, char __user *buf, @@ -223,6 +259,13 @@ static __init int tcpprobe_init(void) { int ret = -ENOMEM; + /* Warning: if the function signature of tcp_rcv_established, + * has been changed, you also have to change the signature of + * jtcp_rcv_established, otherwise you end up right here! + */ + BUILD_BUG_ON(__same_type(tcp_rcv_established, + jtcp_rcv_established) == 0); + init_waitqueue_head(&tcp_probe.wait); spin_lock_init(&tcp_probe.lock); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 766e6bab911..0b24508bcdc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); * @src: source IP address * @dst: destination IP address */ -static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); struct sk_buff *frags = skb_shinfo(skb)->frag_list; @@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = CSUM_MANGLED_0; } } +EXPORT_SYMBOL_GPL(udp4_hwcsum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { @@ -2158,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), |