diff options
Diffstat (limited to 'net/ipv4/icmp.c')
| -rw-r--r-- | net/ipv4/icmp.c | 853 |
1 files changed, 412 insertions, 441 deletions
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a944e8053e2..42b7bcf8045 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,9 +1,7 @@ /* * NET3: Implementation of the ICMP protocol layer. * - * Alan Cox, <alan@redhat.com> - * - * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $ + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -64,6 +62,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/types.h> #include <linux/jiffies.h> @@ -76,6 +76,7 @@ #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> +#include <linux/slab.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> @@ -84,15 +85,17 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/raw.h> +#include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/init.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/inet_common.h> +#include <net/ip_fib.h> /* * Build xmit assembly blocks @@ -108,20 +111,13 @@ struct icmp_bxm { __be32 times[3]; } data; int head_len; - struct ip_options replyopts; - unsigned char optbuf[40]; + struct ip_options_data replyopts; }; -/* - * Statistics - */ -DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; -DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly; - /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ -struct icmp_err icmp_err_convert[] = { +const struct icmp_err icmp_err_convert[] = { { .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ .fatal = 0, @@ -187,29 +183,7 @@ struct icmp_err icmp_err_convert[] = { .fatal = 1, }, }; - -/* Control parameters for ECHO replies. */ -int sysctl_icmp_echo_ignore_all __read_mostly; -int sysctl_icmp_echo_ignore_broadcasts __read_mostly = 1; - -/* Control parameter - ignore bogus broadcast responses? */ -int sysctl_icmp_ignore_bogus_error_responses __read_mostly = 1; - -/* - * Configurable global rate limit. - * - * ratelimit defines tokens/packet consumed for dst->rate_token bucket - * ratemask defines which icmp types are ratelimited by setting - * it's bit position. - * - * default: - * dest unreachable (3), source quench (4), - * time exceeded (11), parameter problem (12) - */ - -int sysctl_icmp_ratelimit __read_mostly = 1 * HZ; -int sysctl_icmp_ratemask __read_mostly = 0x1818; -int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly; +EXPORT_SYMBOL(icmp_err_convert); /* * ICMP control array. This specifies what to do with each ICMP. @@ -229,72 +203,43 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; * * On SMP we have one ICMP socket per-cpu. */ -static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; -#define icmp_socket __get_cpu_var(__icmp_socket) +static struct sock *icmp_sk(struct net *net) +{ + return net->ipv4.icmp_sk[smp_processor_id()]; +} -static inline int icmp_xmit_lock(void) +static inline struct sock *icmp_xmit_lock(struct net *net) { + struct sock *sk; + local_bh_disable(); - if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) { + sk = icmp_sk(net); + + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ local_bh_enable(); - return 1; + return NULL; } - return 0; + return sk; } -static inline void icmp_xmit_unlock(void) +static inline void icmp_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); } /* * Send an ICMP frame. */ -/* - * Check transmit rate limitation for given message. - * The rate information is held in the destination cache now. - * This function is generic and could be used for other purposes - * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. - * - * Note that the same dst_entry fields are modified by functions in - * route.c too, but these work for packet destinations while xrlim_allow - * works for icmp destinations. This means the rate limiting information - * for one "ip object" is shared - and these ICMPs are twice limited: - * by source and by destination. - * - * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits - * - * Shared between ICMPv4 and ICMPv6. - */ -#define XRLIM_BURST_FACTOR 6 -int xrlim_allow(struct dst_entry *dst, int timeout) -{ - unsigned long now, token = dst->rate_tokens; - int rc = 0; - - now = jiffies; - token += now - dst->rate_last; - dst->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; - if (token >= timeout) { - token -= timeout; - rc = 1; - } - dst->rate_tokens = token; - return rc; -} - -static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) { - struct dst_entry *dst = &rt->u.dst; - int rc = 1; + struct dst_entry *dst = &rt->dst; + bool rc = true; if (type > NR_ICMP_TYPES) goto out; @@ -308,8 +253,13 @@ static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & sysctl_icmp_ratemask) - rc = xrlim_allow(dst, sysctl_icmp_ratelimit); + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { + struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + rc = inet_peer_xrlim_allow(peer, + net->ipv4.sysctl_icmp_ratelimit); + if (peer) + inet_putpeer(peer); + } out: return rc; } @@ -317,10 +267,10 @@ out: /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ -void icmp_out_count(unsigned char type) +void icmp_out_count(struct net *net, unsigned char type) { - ICMPMSGOUT_INC_STATS(type); - ICMP_INC_STATS(ICMP_MIB_OUTMSGS); + ICMPMSGOUT_INC_STATS(net, type); + ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } /* @@ -344,21 +294,25 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, } static void icmp_push_reply(struct icmp_bxm *icmp_param, - struct ipcm_cookie *ipc, struct rtable *rt) + struct flowi4 *fl4, + struct ipcm_cookie *ipc, struct rtable **rt) { + struct sock *sk; struct sk_buff *skb; - if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + sk = icmp_sk(dev_net((*rt)->dst.dev)); + if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, - ipc, rt, MSG_DONTWAIT) < 0) - ip_flush_pending_frames(icmp_socket->sk); - else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { + ipc, rt, MSG_DONTWAIT) < 0) { + ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS); + ip_flush_pending_frames(sk); + } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; - skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) { + skb_queue_walk(&sk->sk_write_queue, skb1) { csum = csum_add(csum, skb1->csum); } csum = csum_partial_copy_nocheck((void *)&icmp_param->data, @@ -366,7 +320,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, icmp_param->head_len, csum); icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(icmp_socket->sk); + ip_push_pending_frames(sk, fl4); } } @@ -376,46 +330,149 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct sock *sk = icmp_socket->sk; - struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; - struct rtable *rt = (struct rtable *)skb->dst; - __be32 daddr; + struct rtable *rt = skb_rtable(skb); + struct net *net = dev_net(rt->dst.dev); + struct flowi4 fl4; + struct sock *sk; + struct inet_sock *inet; + __be32 daddr, saddr; + u32 mark = IP4_REPLY_MARK(net, skb->mark); - if (ip_options_echo(&icmp_param->replyopts, skb)) + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; - if (icmp_xmit_lock()) + sk = icmp_xmit_lock(net); + if (sk == NULL) return; + inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; - daddr = ipc.addr = rt->rt_src; + sk->sk_mark = mark; + daddr = ipc.addr = ip_hdr(skb)->saddr; + saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; - if (icmp_param->replyopts.optlen) { - ipc.opt = &icmp_param->replyopts; - if (ipc.opt->srr) - daddr = icmp_param->replyopts.faddr; + ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; + + if (icmp_param->replyopts.opt.opt.optlen) { + ipc.opt = &icmp_param->replyopts.opt; + if (ipc.opt->opt.srr) + daddr = icmp_param->replyopts.opt.opt.faddr; } - { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = rt->rt_spec_dst, - .tos = RT_TOS(ip_hdr(skb)->tos) } }, - .proto = IPPROTO_ICMP }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(rt->u.dst.dev->nd_net, &rt, &fl)) - goto out_unlock; - } - if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = saddr; + fl4.flowi4_mark = mark; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_proto = IPPROTO_ICMP; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + goto out_unlock; + if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) - icmp_push_reply(icmp_param, &ipc, rt); + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: - icmp_xmit_unlock(); + icmp_xmit_unlock(sk); } +static struct rtable *icmp_route_lookup(struct net *net, + struct flowi4 *fl4, + struct sk_buff *skb_in, + const struct iphdr *iph, + __be32 saddr, u8 tos, u32 mark, + int type, int code, + struct icmp_bxm *param) +{ + struct rtable *rt, *rt2; + struct flowi4 fl4_dec; + int err; + + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = (param->replyopts.opt.opt.srr ? + param->replyopts.opt.opt.faddr : iph->saddr); + fl4->saddr = saddr; + fl4->flowi4_mark = mark; + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_proto = IPPROTO_ICMP; + fl4->fl4_icmp_type = type; + fl4->fl4_icmp_code = code; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + rt = __ip_route_output_key(net, fl4); + if (IS_ERR(rt)) + return rt; + + /* No need to clone since we're just using its address. */ + rt2 = rt; + + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + if (!IS_ERR(rt)) { + if (rt != rt2) + return rt; + } else if (PTR_ERR(rt) == -EPERM) { + rt = NULL; + } else + return rt; + + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); + if (err) + goto relookup_failed; + + if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, &fl4_dec); + if (IS_ERR(rt2)) + err = PTR_ERR(rt2); + } else { + struct flowi4 fl4_2 = {}; + unsigned long orefdst; + + fl4_2.daddr = fl4_dec.saddr; + rt2 = ip_route_output_key(net, &fl4_2); + if (IS_ERR(rt2)) { + err = PTR_ERR(rt2); + goto relookup_failed; + } + /* Ugh! */ + orefdst = skb_in->_skb_refdst; /* save old refdst */ + err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, + RT_TOS(tos), rt2->dst.dev); + + dst_release(&rt2->dst); + rt2 = skb_rtable(skb_in); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ + } + + if (err) + goto relookup_failed; + + rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, + flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + if (!IS_ERR(rt2)) { + dst_release(&rt->dst); + memcpy(fl4, &fl4_dec, sizeof(*fl4)); + rt = rt2; + } else if (PTR_ERR(rt2) == -EPERM) { + if (rt) + dst_release(&rt->dst); + return rt2; + } else { + err = PTR_ERR(rt2); + goto relookup_failed; + } + return rt; + +relookup_failed: + if (rt) + return rt; + return ERR_PTR(err); +} /* * Send an ICMP message in response to a situation @@ -432,16 +489,19 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct iphdr *iph; int room; - struct icmp_bxm icmp_param; - struct rtable *rt = (struct rtable *)skb_in->dst; + struct icmp_bxm *icmp_param; + struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; + struct flowi4 fl4; __be32 saddr; u8 tos; + u32 mark; struct net *net; + struct sock *sk; if (!rt) goto out; - net = rt->u.dst.dev->nd_net; + net = dev_net(rt->dst.dev); /* * Find the original header. It is expected to be valid, of course. @@ -451,7 +511,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph = ip_hdr(skb_in); if ((u8 *)iph < skb_in->head || - (skb_in->network_header + sizeof(*iph)) > skb_in->tail) + (skb_network_header(skb_in) + sizeof(*iph)) > + skb_tail_pointer(skb_in)) goto out; /* @@ -505,9 +566,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - if (icmp_xmit_lock()) + icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); + if (!icmp_param) return; + sk = icmp_xmit_lock(net); + if (sk == NULL) + goto out_free; + /* * Construct source address and options. */ @@ -516,21 +582,24 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; - if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(net, rt->fl.iif); + rcu_read_lock(); + if (rt_is_input_route(rt) && + net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) + dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); - if (dev) { + if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); - dev_put(dev); - } else + else saddr = 0; + rcu_read_unlock(); } tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param.replyopts, skb_in)) + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) goto out_unlock; @@ -538,139 +607,97 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. */ - icmp_param.data.icmph.type = type; - icmp_param.data.icmph.code = code; - icmp_param.data.icmph.un.gateway = info; - icmp_param.data.icmph.checksum = 0; - icmp_param.skb = skb_in; - icmp_param.offset = skb_network_offset(skb_in); - inet_sk(icmp_socket->sk)->tos = tos; + icmp_param->data.icmph.type = type; + icmp_param->data.icmph.code = code; + icmp_param->data.icmph.un.gateway = info; + icmp_param->data.icmph.checksum = 0; + icmp_param->skb = skb_in; + icmp_param->offset = skb_network_offset(skb_in); + inet_sk(sk)->tos = tos; + sk->sk_mark = mark; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts; - - { - struct flowi fl = { - .nl_u = { - .ip4_u = { - .daddr = icmp_param.replyopts.srr ? - icmp_param.replyopts.faddr : - iph->saddr, - .saddr = saddr, - .tos = RT_TOS(tos) - } - }, - .proto = IPPROTO_ICMP, - .uli_u = { - .icmpt = { - .type = type, - .code = code - } - } - }; - int err; - struct rtable *rt2; - - security_skb_classify_flow(skb_in, &fl); - if (__ip_route_output_key(net, &rt, &fl)) - goto out_unlock; - - /* No need to clone since we're just using its address. */ - rt2 = rt; - - err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); - switch (err) { - case 0: - if (rt != rt2) - goto route_done; - break; - case -EPERM: - rt = NULL; - break; - default: - goto out_unlock; - } - - if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) - goto ende; - - if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) - err = __ip_route_output_key(net, &rt2, &fl); - else { - struct flowi fl2 = {}; - struct dst_entry *odst; - - fl2.fl4_dst = fl.fl4_src; - if (ip_route_output_key(net, &rt2, &fl2)) - goto ende; - - /* Ugh! */ - odst = skb_in->dst; - err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->u.dst.dev); - - dst_release(&rt2->u.dst); - rt2 = (struct rtable *)skb_in->dst; - skb_in->dst = odst; - } - - if (err) - goto ende; - - err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL, - XFRM_LOOKUP_ICMP); - if (err == -ENOENT) { - if (!rt) - goto out_unlock; - goto route_done; - } - - dst_release(&rt->u.dst); - rt = rt2; - - if (err) - goto out_unlock; - } + ipc.opt = &icmp_param->replyopts.opt; + ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; + + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, + type, code, icmp_param); + if (IS_ERR(rt)) + goto out_unlock; -route_done: - if (!icmpv4_xrlim_allow(rt, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->u.dst); + room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); - icmp_param.data_len = skb_in->len - icmp_param.offset; - if (icmp_param.data_len > room) - icmp_param.data_len = room; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data_len = skb_in->len - icmp_param->offset; + if (icmp_param->data_len > room) + icmp_param->data_len = room; + icmp_param->head_len = sizeof(struct icmphdr); - icmp_push_reply(&icmp_param, &ipc, rt); + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: - icmp_xmit_unlock(); + icmp_xmit_unlock(sk); +out_free: + kfree(icmp_param); out:; } +EXPORT_SYMBOL(icmp_send); + + +static void icmp_socket_deliver(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. + */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + return; + + raw_icmp_error(skb, protocol, info); + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); +} +static bool icmp_tag_validation(int proto) +{ + bool ok; + + rcu_read_lock(); + ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; + rcu_read_unlock(); + return ok; +} /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * ICMP_PARAMETERPROB. */ static void icmp_unreach(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; struct icmphdr *icmph; - int hash, protocol; - struct net_protocol *ipprot; - u32 info = 0; struct net *net; + u32 info = 0; - net = skb->dst->dev->nd_net; + net = dev_net(skb_dst(skb)->dev); /* * Incomplete header ? @@ -682,7 +709,7 @@ static void icmp_unreach(struct sk_buff *skb) goto out_err; icmph = icmp_hdr(skb); - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; @@ -695,22 +722,28 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " - "fragmentation needed " - "and DF set.\n", - NIPQUAD(iph->daddr)); - } else { - info = ip_rt_frag_needed(net, iph, - ntohs(icmph->un.frag.mtu)); - if (!info) + /* for documentation of the ip_no_pmtu_disc + * values please see + * Documentation/networking/ip-sysctl.txt + */ + switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + default: + LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), + &iph->daddr); + break; + case 2: + goto out; + case 3: + if (!icmp_tag_validation(iph->protocol)) goto out; + /* fall through */ + case 0: + info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " - "Route Failed.\n", - NIPQUAD(iph->daddr)); + LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), + &iph->daddr); break; default: break; @@ -732,50 +765,27 @@ static void icmp_unreach(struct sk_buff *skb) */ /* - * Check the other end isnt violating RFC 1122. Some routers send + * Check the other end isn't violating RFC 1122. Some routers send * bogus responses to broadcast frames. If you see this message * first check your netmask matches at both ends, if it does then * get the other vendor to fix their kit. */ - if (!sysctl_icmp_ignore_bogus_error_responses && + if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { - if (net_ratelimit()) - printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " - "type %u, code %u " - "error to a broadcast: %u.%u.%u.%u on %s\n", - NIPQUAD(ip_hdr(skb)->saddr), - icmph->type, icmph->code, - NIPQUAD(iph->daddr), - skb->dev->name); + net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", + &ip_hdr(skb)->saddr, + icmph->type, icmph->code, + &iph->daddr, skb->dev->name); goto out; } - /* Checkin full IP header plus 8 bytes of protocol to - * avoid additional coding at protocol handlers. - */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) - goto out; - - iph = (struct iphdr *)skb->data; - protocol = iph->protocol; - - /* - * Deliver ICMP message to raw sockets. Pretty useless feature? - */ - raw_icmp_error(skb, protocol, info); - - hash = protocol & (MAX_INET_PROTOS - 1); - rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[hash]); - if (ipprot && ipprot->err_handler) - ipprot->err_handler(skb, info); - rcu_read_unlock(); + icmp_socket_deliver(skb, info); out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto out; } @@ -786,37 +796,15 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { - struct iphdr *iph; - - if (skb->len < sizeof(struct iphdr)) - goto out_err; + if (skb->len < sizeof(struct iphdr)) { + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + return; + } - /* - * Get the copied header of the packet that caused the redirect - */ if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - - iph = (struct iphdr *)skb->data; + return; - switch (icmp_hdr(skb)->code & 7) { - case ICMP_REDIR_NET: - case ICMP_REDIR_NETTOS: - /* - * As per RFC recommendations now handle it as a host redirect. - */ - case ICMP_REDIR_HOST: - case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, - icmp_hdr(skb)->un.gateway, - iph->saddr, skb->dev); - break; - } -out: - return; -out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); - goto out; + icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); } /* @@ -833,7 +821,10 @@ out_err: static void icmp_echo(struct sk_buff *skb) { - if (!sysctl_icmp_echo_ignore_all) { + struct net *net; + + net = dev_net(skb_dst(skb)->dev); + if (!net->ipv4.sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; icmp_param.data.icmph = *icmp_hdr(skb); @@ -855,7 +846,7 @@ static void icmp_echo(struct sk_buff *skb) */ static void icmp_timestamp(struct sk_buff *skb) { - struct timeval tv; + struct timespec tv; struct icmp_bxm icmp_param; /* * Too short. @@ -866,9 +857,9 @@ static void icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - do_gettimeofday(&tv); - icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 + - tv.tv_usec / 1000); + getnstimeofday(&tv); + icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + + tv.tv_nsec / NSEC_PER_MSEC); icmp_param.data.times[2] = icmp_param.data.times[1]; if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); @@ -883,94 +874,10 @@ static void icmp_timestamp(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); goto out; } - -/* - * Handle ICMP_ADDRESS_MASK requests. (RFC950) - * - * RFC1122 (3.2.2.9). A host MUST only send replies to - * ADDRESS_MASK requests if it's been configured as an address mask - * agent. Receiving a request doesn't constitute implicit permission to - * act as one. Of course, implementing this correctly requires (SHOULD) - * a way to turn the functionality on and off. Another one for sysctl(), - * I guess. -- MS - * - * RFC1812 (4.3.3.9). A router MUST implement it. - * A router SHOULD have switch turning it on/off. - * This switch MUST be ON by default. - * - * Gratuitous replies, zero-source replies are not implemented, - * that complies with RFC. DO NOT implement them!!! All the idea - * of broadcast addrmask replies as specified in RFC950 is broken. - * The problem is that it is not uncommon to have several prefixes - * on one physical interface. Moreover, addrmask agent can even be - * not aware of existing another prefixes. - * If source is zero, addrmask agent cannot choose correct prefix. - * Gratuitous mask announcements suffer from the same problem. - * RFC1812 explains it, but still allows to use ADDRMASK, - * that is pretty silly. --ANK - * - * All these rules are so bizarre, that I removed kernel addrmask - * support at all. It is wrong, it is obsolete, nobody uses it in - * any case. --ANK - * - * Furthermore you can do it with a usermode address agent program - * anyway... - */ - -static void icmp_address(struct sk_buff *skb) -{ -#if 0 - if (net_ratelimit()) - printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); -#endif -} - -/* - * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain - * loudly if an inconsistency is found. - */ - -static void icmp_address_reply(struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb->dst; - struct net_device *dev = skb->dev; - struct in_device *in_dev; - struct in_ifaddr *ifa; - - if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) - goto out; - - in_dev = in_dev_get(dev); - if (!in_dev) - goto out; - rcu_read_lock(); - if (in_dev->ifa_list && - IN_DEV_LOG_MARTIANS(in_dev) && - IN_DEV_FORWARD(in_dev)) { - __be32 _mask, *mp; - - mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); - BUG_ON(mp == NULL); - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - if (*mp == ifa->ifa_mask && - inet_ifa_match(rt->rt_src, ifa)) - break; - } - if (!ifa && net_ratelimit()) { - printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from " - "%s/%u.%u.%u.%u\n", - NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); - } - } - rcu_read_unlock(); - in_dev_put(in_dev); -out:; -} - static void icmp_discard(struct sk_buff *skb) { } @@ -981,12 +888,14 @@ static void icmp_discard(struct sk_buff *skb) int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb_rtable(skb); + struct net *net = dev_net(rt->dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + struct sec_path *sp = skb_sec_path(skb); int nh; - if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & + if (!(sp && sp->xvec[sp->len - 1]->props.flags & XFRM_STATE_ICMP)) goto drop; @@ -1002,25 +911,17 @@ int icmp_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto error; - } + if (skb_checksum_simple_validate(skb)) + goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(icmph->type); + ICMPMSGIN_INC_STATS_BH(net, icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -1044,7 +945,7 @@ int icmp_rcv(struct sk_buff *skb) */ if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && - sysctl_icmp_echo_ignore_broadcasts) { + net->ipv4.sysctl_icmp_echo_ignore_broadcasts) { goto error; } if (icmph->type != ICMP_ECHO && @@ -1060,17 +961,43 @@ int icmp_rcv(struct sk_buff *skb) drop: kfree_skb(skb); return 0; +csum_error: + ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS); error: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto drop; } +void icmp_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + int offset = iph->ihl<<2; + struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; + struct net *net = dev_net(skb->dev); + + /* + * Use ping_err to handle all icmp errors except those + * triggered by ICMP_ECHOREPLY which sent from kernel. + */ + if (icmph->type != ICMP_ECHOREPLY) { + ping_err(skb, offset, info); + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + else if (type == ICMP_REDIRECT) + ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); +} + /* * This table is the definition of how we handle ICMP. */ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { - .handler = icmp_discard, + .handler = ping_rcv, }, [1] = { .handler = icmp_discard, @@ -1132,48 +1059,92 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { .handler = icmp_discard, }, [ICMP_ADDRESS] = { - .handler = icmp_address, + .handler = icmp_discard, }, [ICMP_ADDRESSREPLY] = { - .handler = icmp_address_reply, + .handler = icmp_discard, }, }; -void __init icmp_init(struct net_proto_family *ops) +static void __net_exit icmp_sk_exit(struct net *net) { - struct inet_sock *inet; int i; - for_each_possible_cpu(i) { - int err; + for_each_possible_cpu(i) + inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); + kfree(net->ipv4.icmp_sk); + net->ipv4.icmp_sk = NULL; +} + +static int __net_init icmp_sk_init(struct net *net) +{ + int i, err; - err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, - &per_cpu(__icmp_socket, i)); + net->ipv4.icmp_sk = + kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + if (net->ipv4.icmp_sk == NULL) + return -ENOMEM; + for_each_possible_cpu(i) { + struct sock *sk; + + err = inet_ctl_sock_create(&sk, PF_INET, + SOCK_RAW, IPPROTO_ICMP, net); if (err < 0) - panic("Failed to create the ICMP control socket.\n"); + goto fail; - per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC; + net->ipv4.icmp_sk[i] = sk; /* Enough space for 2 64K ICMP packets, including - * sk_buff struct overhead. + * sk_buff/skb_shared_info struct overhead. */ - per_cpu(__icmp_socket, i)->sk->sk_sndbuf = - (2 * ((64 * 1024) + sizeof(struct sk_buff))); - - inet = inet_sk(per_cpu(__icmp_socket, i)->sk); - inet->uc_ttl = -1; - inet->pmtudisc = IP_PMTUDISC_DONT; + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); - /* Unhash it so that IP input processing does not even - * see it, we do not wish this socket to see incoming - * packets. + /* + * Speedup sock_wfree() */ - per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk); + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; } + + /* Control parameters for ECHO replies. */ + net->ipv4.sysctl_icmp_echo_ignore_all = 0; + net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; + + /* Control parameter - ignore bogus broadcast responses? */ + net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1; + + /* + * Configurable global rate limit. + * + * ratelimit defines tokens/packet consumed for dst->rate_token + * bucket ratemask defines which icmp types are ratelimited by + * setting it's bit position. + * + * default: + * dest unreachable (3), source quench (4), + * time exceeded (11), parameter problem (12) + */ + + net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; + net->ipv4.sysctl_icmp_ratemask = 0x1818; + net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + + return 0; + +fail: + for_each_possible_cpu(i) + inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); + kfree(net->ipv4.icmp_sk); + return err; } -EXPORT_SYMBOL(icmp_err_convert); -EXPORT_SYMBOL(icmp_send); -EXPORT_SYMBOL(icmp_statistics); -EXPORT_SYMBOL(xrlim_allow); +static struct pernet_operations __net_initdata icmp_sk_ops = { + .init = icmp_sk_init, + .exit = icmp_sk_exit, +}; + +int __init icmp_init(void) +{ + return register_pernet_subsys(&icmp_sk_ops); +} |
