/* * net/sched/cls_flow.c Generic flow classifier * * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/pkt_cls.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/slab.h> #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> #endif struct flow_head { struct list_head filters; }; struct flow_filter { struct list_head list; struct tcf_exts exts; struct tcf_ematch_tree ematches; struct timer_list perturb_timer; u32 perturb_period; u32 handle; u32 nkeys; u32 keymask; u32 mode; u32 mask; u32 xor; u32 rshift; u32 addend; u32 divisor; u32 baseclass; u32 hashrnd; }; static const struct tcf_ext_map flow_ext_map = { .action = TCA_FLOW_ACT, .police = TCA_FLOW_POLICE, }; static inline u32 addr_fold(void *addr) { unsigned long a = (unsigned long)addr; return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); } static u32 flow_get_src(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): if (pskb_network_may_pull(skb, sizeof(struct iphdr))) return ntohl(ip_hdr(skb)->saddr); break; case htons(ETH_P_IPV6): if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]); break; } return addr_fold(skb->sk); } static u32 flow_get_dst(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): if (pskb_network_may_pull(skb, sizeof(struct iphdr))) return ntohl(ip_hdr(skb)->daddr); break; case htons(ETH_P_IPV6): if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]); break; } return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } static u32 flow_get_proto(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return pskb_network_may_pull(skb, sizeof(struct iphdr)) ? ip_hdr(skb)->protocol : 0; case htons(ETH_P_IPV6): return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ? ipv6_hdr(skb)->nexthdr : 0; default: return 0; } } static u32 flow_get_proto_src(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); if (iph->frag_off & htons(IP_MF | IP_OFFSET)) break; poff = proto_ports_offset(iph->protocol); if (poff >= 0 && pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) { iph = ip_hdr(skb); return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + poff)); } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); poff = proto_ports_offset(iph->nexthdr); if (poff >= 0 && pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) { iph = ipv6_hdr(skb); return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + poff)); } break; } } return addr_fold(skb->sk); } static u32 flow_get_proto_dst(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); if (iph->frag_off & htons(IP_MF | IP_OFFSET)) break; poff = proto_ports_offset(iph->protocol); if (poff >= 0 && pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { iph = ip_hdr(skb); return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2 + poff)); } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); poff = proto_ports_offset(iph->nexthdr); if (poff >= 0 && pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) { iph = ipv6_hdr(skb); return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + poff + 2)); } break; } } return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } static u32 flow_get_iif(const struct sk_buff *skb) { return skb->skb_iif; } static u32 flow_get_priority(const struct sk_buff *skb) { return skb->priority; } static u32 flow_get_mark(const struct sk_buff *skb) { return skb->mark; } static u32 flow_get_nfct(const struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) return addr_fold(skb->nfct); #else return 0; #endif } #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #define CTTUPLE(skb, member) \ ({ \ enum ip_conntrack_info ctinfo; \ struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ if (ct == NULL) \ goto fallback; \ ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ }) #else #define CTTUPLE(skb, member) \ ({ \ goto fallback; \ 0; \ }) #endif static u32 flow_get_nfct_src(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: return flow_get_src(skb); } static u32 flow_get_nfct_dst(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: return flow_get_dst(skb); } static u32 flow_get_nfct_proto_src(struct sk_buff *skb) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: return flow_get_proto_src(skb); } static u32 flow_get_nfct_proto_dst(struct sk_buff *skb) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: return flow_get_proto_dst(skb); } static u32 flow_get_rtclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID if (skb_dst(skb)) return skb_dst(skb)->tclassid; #endif return 0; } static u32 flow_get_skuid(const struct sk_buff *skb) { if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) return skb->sk->sk_socket->file->f_cred->fsuid; return 0; } static u32 flow_get_skgid(const struct sk_buff *skb) { if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) return skb->sk->sk_socket->file->f_cred->fsgid; return 0; } static u32 flow_get_vlan_tag(const struct sk_buff *skb) { u16 uninitialized_var(tag); if (vlan_get_tag(skb, &tag) < 0) return 0; return tag & VLAN_VID_MASK; } static u32 flow_get_rxhash(struct sk_buff *skb) { return skb_get_rxhash(skb); } static u32 flow_key_get(struct sk_buff *skb, int key) { switch (key) { case FLOW_KEY_SRC: return flow_get_src(skb); case FLOW_KEY_DST: return flow_get_dst(skb); case FLOW_KEY_PROTO: return flow_get_proto(skb); case FLOW_KEY_PROTO_SRC: return flow_get_proto_src(skb); case FLOW_KEY_PROTO_DST: return flow_get_proto_dst(skb); case FLOW_KEY_IIF: return flow_get_iif(skb); case FLOW_KEY_PRIORITY: return flow_get_priority(skb); case FLOW_KEY_MARK: return flow_get_mark(skb); case FLOW_KEY_NFCT: return flow_get_nfct(skb); case FLOW_KEY_NFCT_SRC: return flow_get_nfct_src(skb); case FLOW_KEY_NFCT_DST: return flow_get_nfct_dst(skb); case FLOW_KEY_NFCT_PROTO_SRC: return flow_get_nfct_proto_src(skb); case FLOW_KEY_NFCT_PROTO_DST: return flow_get_nfct_proto_dst(skb); case FLOW_KEY_RTCLASSID: return flow_get_rtclassid(skb); case FLOW_KEY_SKUID: return flow_get_skuid(skb); case FLOW_KEY_SKGID: return flow_get_skgid(skb); case FLOW_KEY_VLAN_TAG: return flow_get_vlan_tag(skb); case FLOW_KEY_RXHASH: return flow_get_rxhash(skb); default: WARN_ON(1); return 0; } } static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { struct flow_head *head = tp->root; struct flow_filter *f; u32 keymask; u32 classid; unsigned int n, key; int r; list_for_each_entry(f, &head->filters, list) { u32 keys[f->nkeys]; if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; keymask = f->keymask; for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; keymask &= ~(1 << key); keys[n] = flow_key_get(skb, key); } if (f->mode == FLOW_MODE_HASH) classid = jhash2(keys, f->nkeys, f->hashrnd); else { classid = keys[0]; classid = (classid & f->mask) ^ f->xor; classid = (classid >> f->rshift) + f->addend; } if (f->divisor) classid %= f->divisor; res->class = 0; res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } return -1; } static void flow_perturbation(unsigned long arg) { struct flow_filter *f = (struct flow_filter *)arg; get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) mod_timer(&f->perturb_timer, jiffies + f->perturb_period); } static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, [TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, [TCA_FLOW_ACT] = { .type = NLA_NESTED }, [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; static int flow_change(struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { struct flow_head *head = tp->root; struct flow_filter *f; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FLOW_MAX + 1]; struct tcf_exts e; struct tcf_ematch_tree t; unsigned int nkeys = 0; unsigned int perturb_period = 0; u32 baseclass = 0; u32 keymask = 0; u32 mode; int err; if (opt == NULL) return -EINVAL; err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy); if (err < 0) return err; if (tb[TCA_FLOW_BASECLASS]) { baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); if (TC_H_MIN(baseclass) == 0) return -EINVAL; } if (tb[TCA_FLOW_KEYS]) { keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); nkeys = hweight32(keymask); if (nkeys == 0) return -EINVAL; if (fls(keymask) - 1 > FLOW_KEY_MAX) return -EOPNOTSUPP; } err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); if (err < 0) return err; err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); if (err < 0) goto err1; f = (struct flow_filter *)*arg; if (f != NULL) { err = -EINVAL; if (f->handle != handle && handle) goto err2; mode = f->mode; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (mode == FLOW_MODE_HASH) perturb_period = f->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } } else { err = -EINVAL; if (!handle) goto err2; if (!tb[TCA_FLOW_KEYS]) goto err2; mode = FLOW_MODE_MAP; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } if (TC_H_MAJ(baseclass) == 0) baseclass = TC_H_MAKE(tp->q->handle, baseclass); if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); err = -ENOBUFS; f = kzalloc(sizeof(*f), GFP_KERNEL); if (f == NULL) goto err2; f->handle = handle; f->mask = ~0U; get_random_bytes(&f->hashrnd, 4); f->perturb_timer.function = flow_perturbation; f->perturb_timer.data = (unsigned long)f; init_timer_deferrable(&f->perturb_timer); } tcf_exts_change(tp, &f->exts, &e); tcf_em_tree_change(tp, &f->ematches, &t); tcf_tree_lock(tp); if (tb[TCA_FLOW_KEYS]) { f->keymask = keymask; f->nkeys = nkeys; } f->mode = mode; if (tb[TCA_FLOW_MASK]) f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); if (tb[TCA_FLOW_XOR]) f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); if (tb[TCA_FLOW_RSHIFT]) f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); if (tb[TCA_FLOW_ADDEND]) f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); if (tb[TCA_FLOW_DIVISOR]) f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); if (baseclass) f->baseclass = baseclass; f->perturb_period = perturb_period; del_timer(&f->perturb_timer); if (perturb_period) mod_timer(&f->perturb_timer, jiffies + perturb_period); if (*arg == 0) list_add_tail(&f->list, &head->filters); tcf_tree_unlock(tp); *arg = (unsigned long)f; return 0; err2: tcf_em_tree_destroy(tp, &t); err1: tcf_exts_destroy(tp, &e); return err; } static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) { del_timer_sync(&f->perturb_timer); tcf_exts_destroy(tp, &f->exts); tcf_em_tree_destroy(tp, &f->ematches); kfree(f); } static int flow_delete(struct tcf_proto *tp, unsigned long arg) { struct flow_filter *f = (struct flow_filter *)arg; tcf_tree_lock(tp); list_del(&f->list); tcf_tree_unlock(tp); flow_destroy_filter(tp, f); return 0; } static int flow_init(struct tcf_proto *tp) { struct flow_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->filters); tp->root = head; return 0; } static void flow_destroy(struct tcf_proto *tp) { struct flow_head *head = tp->root; struct flow_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) { list_del(&f->list); flow_destroy_filter(tp, f); } kfree(head); } static unsigned long flow_get(struct tcf_proto *tp, u32 handle) { struct flow_head *head = tp->root; struct flow_filter *f; list_for_each_entry(f, &head->filters, list) if (f->handle == handle) return (unsigned long)f; return 0; } static void flow_put(struct tcf_proto *tp, unsigned long f) { } static int flow_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct flow_filter *f = (struct flow_filter *)fh; struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask); NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode); if (f->mask != ~0 || f->xor != 0) { NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask); NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor); } if (f->rshift) NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift); if (f->addend) NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend); if (f->divisor) NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor); if (f->baseclass) NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass); if (f->perturb_period) NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ); if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) goto nla_put_failure; #ifdef CONFIG_NET_EMATCH if (f->ematches.hdr.nmatches && tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) goto nla_put_failure; #endif nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, nest); return -1; } static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct flow_head *head = tp->root; struct flow_filter *f; list_for_each_entry(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long)f, arg) < 0) { arg->stop = 1; break; } skip: arg->count++; } } static struct tcf_proto_ops cls_flow_ops __read_mostly = { .kind = "flow", .classify = flow_classify, .init = flow_init, .destroy = flow_destroy, .change = flow_change, .delete = flow_delete, .get = flow_get, .put = flow_put, .dump = flow_dump, .walk = flow_walk, .owner = THIS_MODULE, }; static int __init cls_flow_init(void) { return register_tcf_proto_ops(&cls_flow_ops); } static void __exit cls_flow_exit(void) { unregister_tcf_proto_ops(&cls_flow_ops); } module_init(cls_flow_init); module_exit(cls_flow_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("TC flow classifier");