diff options
-rw-r--r-- | include/net/ip_vs.h | 53 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_core.c | 29 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_conn.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 586 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 18 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ftp.c | 7 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto.c | 8 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 52 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_sctp.c | 8 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_tcp.c | 52 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_udp.c | 51 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 503 |
12 files changed, 959 insertions, 410 deletions
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 6e8a6192e57..b7bbd6c28cf 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -25,7 +25,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> /* for ipv6_addr_copy */ -#ifdef CONFIG_IP_VS_NFCT +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> #endif @@ -136,24 +136,24 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, if (net_ratelimit()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) -#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) \ +#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \ - pp->debug_packet(pp, skb, ofs, msg); \ + pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) -#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) \ +#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \ net_ratelimit()) \ - pp->debug_packet(pp, skb, ofs, msg); \ + pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #else /* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0) -#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) do {} while (0) -#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) do {} while (0) +#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) +#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif #define IP_VS_BUG() BUG() @@ -345,7 +345,7 @@ struct ip_vs_protocol { int (*app_conn_bind)(struct ip_vs_conn *cp); - void (*debug_packet)(struct ip_vs_protocol *pp, + void (*debug_packet)(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); @@ -409,6 +409,7 @@ struct ip_vs_conn { /* packet transmitter for different forwarding methods. If it mangles the packet, it must return NF_DROP or better NF_STOLEN, otherwise this must be changed to a sk_buff **. + NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); @@ -597,11 +598,19 @@ struct ip_vs_app { __be16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ - /* output hook: return false if can't linearize. diff set for TCP. */ + /* + * output hook: Process packet in inout direction, diff set for TCP. + * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, + * 2=Mangled but checksum was not updated + */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); - /* input hook: return false if can't linearize. diff set for TCP. */ + /* + * input hook: Process packet in outin direction, diff set for TCP. + * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, + * 2=Mangled but checksum was not updated + */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); @@ -819,7 +828,8 @@ extern int ip_vs_set_state_timeout(int *table, int num, const char *const *names, const char *name, int to); extern void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, int offset, const char *msg); extern struct ip_vs_protocol ip_vs_protocol_tcp; @@ -841,7 +851,8 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc); extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb); +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored); extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp); @@ -1013,6 +1024,24 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) return csum_partial(diff, sizeof(diff), oldsum); } +/* + * Forget current conntrack (unconfirmed) and attach notrack entry + */ +static inline void ip_vs_notrack(struct sk_buff *skb) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (!ct || !nf_ct_is_untracked(ct)) { + nf_reset(skb); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + } +#endif +} + #ifdef CONFIG_IP_VS_NFCT /* * Netfilter connection tracking diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index e2e00c4da88..0047923c1f2 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -462,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, return 0; } + if (manip == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + pr_debug("icmp_reply_translation: translating error %p manip %u " "dir %s\n", skb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); @@ -496,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* Change outer to look the reply to an incoming packet * (proto 0 means don't invert per-proto part). */ - if (manip == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, skb, 0, &target, manip)) - return 0; - } + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, skb, 0, &target, manip)) + return 0; return 1; } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 1d1a529dbe2..e9adecdc8ca 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -563,6 +563,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + cp->flags &= ~IP_VS_CONN_F_FWD_MASK; } cp->flags |= conn_flags; cp->dest = dest; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e5fef7aef0d..b4e51e9c5a0 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -48,6 +48,7 @@ #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <linux/netfilter_ipv6.h> +#include <net/ip6_route.h> #endif #include <net/ip_vs.h> @@ -342,7 +343,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * Protocols supported: TCP, UDP */ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -350,16 +352,44 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) __be16 _ports[2], *pptr; unsigned int flags; + *ignored = 1; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. It is risky + * for fwmark services but mostly for persistent services. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && + (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + + /* * Persistent service */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + *ignored = 0; return ip_vs_sched_persist(svc, skb, pptr); + } /* * Non-persistent service @@ -372,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) return NULL; } + *ignored = 0; + dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); @@ -498,35 +530,32 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - else + } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain and is used to avoid double NAT and confirmation when we do - * not want to keep the conntrack structure - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } -__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) { - return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; } static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -589,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } @@ -634,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_PARTIAL; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); } #endif @@ -679,11 +710,23 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + goto out; + } else +#endif + if ((sysctl_ip_vs_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto out; + /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; @@ -699,7 +742,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -714,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -757,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); offset += cih->ihl * 4; @@ -773,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; @@ -789,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) /* reassemble IP fragments */ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -832,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking outgoing ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -880,7 +927,7 @@ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int ihl) { - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); if (!skb_make_writable(skb, ihl)) goto drop; @@ -914,23 +961,24 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * if it came from this machine itself. So re-compute * the routing information. */ - if (sysctl_ip_vs_snat_reroute) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else + if (af == AF_INET6) { + if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + goto drop; + } else #endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - } + if ((sysctl_ip_vs_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); @@ -946,53 +994,54 @@ drop: } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int af; EnterFunction(11); - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - + /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum); - if (related) { - if (sysctl_ip_vs_snat_reroute && - NF_ACCEPT == verdict && - ip6_route_me_harder(skb)) - verdict = NF_DROP; + if (related) return verdict; - } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); - if (related) { - if (sysctl_ip_vs_snat_reroute && - NF_ACCEPT == verdict && - ip_route_me_harder(skb, RTN_LOCAL)) - verdict = NF_DROP; + if (related) return verdict; - } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } @@ -1003,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; } + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } else #endif if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) return NF_STOLEN; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); @@ -1026,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, */ cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP || - pp->protocol == IPPROTO_SCTP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if ((iph.protocol != IPPROTO_TCP && - iph.protocol != IPPROTO_SCTP) - || ((iph.protocol == IPPROTO_TCP - && !is_tcp_reset(skb, iph.len)) - || (iph.protocol == IPPROTO_SCTP - && !is_sctp_abort(skb, - iph.len)))) { + if (likely(cp)) + return handle_response(af, skb, pp, cp, iph.len); + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0); - else + if (af == AF_INET6) { + struct net *net = + dev_net(skb_dst(skb)->dev); + + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else #endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; } } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; } + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; +} + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; - return handle_response(af, skb, pp, cp, iph.len); + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; } +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif /* * Handle ICMP messages in the outside-to-inside direction (incoming). @@ -1098,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1142,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); offset += cih->ihl * 4; @@ -1176,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ + /* LOCALNODE from FORWARD hook is not supported */ + if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && + skb_rtable(skb)->rt_flags & RTCF_LOCAL) { + IP_VS_DBG(1, "%s(): " + "local delivery to %pI4 but in FORWARD\n", + __func__, &skb_rtable(skb)->rt_dst); + verdict = NF_DROP; + } out: __ip_vs_conn_put(cp); @@ -1197,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; unsigned int offset, verdict; union nf_inet_addr snet; + struct rt6_info *rt; *related = 1; /* reassemble IP fragments */ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1247,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -1275,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) IPPROTO_SCTP == cih->nexthdr) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ + /* LOCALNODE from FORWARD hook is not supported */ + if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && + (rt = (struct rt6_info *) skb_dst(skb)) && + rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "local delivery to %pI6 but in FORWARD\n", + __func__, &rt->rt6i_dst); + verdict = NF_DROP; + } __ip_vs_conn_put(cp); @@ -1289,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * and send it on its way... */ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ret, restart, af, pkts; - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + int ret, restart, pkts; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); if (related) return verdict; @@ -1326,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); if (related) return verdict; @@ -1346,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { int v; - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_in: packet continues traversal as normal"); return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { @@ -1429,6 +1571,72 @@ out: return ret; } +/* + * AF_INET handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET); +} + +/* + * AF_INET handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * AF_INET6 handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET6); +} + +/* + * AF_INET6 handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif + /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP @@ -1469,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 99, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(chan |