diff options
Diffstat (limited to 'net/openvswitch')
| -rw-r--r-- | net/openvswitch/Makefile | 2 | ||||
| -rw-r--r-- | net/openvswitch/actions.c | 18 | ||||
| -rw-r--r-- | net/openvswitch/datapath.c | 1601 | ||||
| -rw-r--r-- | net/openvswitch/datapath.h | 26 | ||||
| -rw-r--r-- | net/openvswitch/dp_notify.c | 18 | ||||
| -rw-r--r-- | net/openvswitch/flow.c | 1718 | ||||
| -rw-r--r-- | net/openvswitch/flow.h | 185 | ||||
| -rw-r--r-- | net/openvswitch/flow_netlink.c | 1576 | ||||
| -rw-r--r-- | net/openvswitch/flow_netlink.h | 60 | ||||
| -rw-r--r-- | net/openvswitch/flow_table.c | 647 | ||||
| -rw-r--r-- | net/openvswitch/flow_table.h | 86 | ||||
| -rw-r--r-- | net/openvswitch/vport-gre.c | 25 | ||||
| -rw-r--r-- | net/openvswitch/vport-internal_dev.c | 4 | ||||
| -rw-r--r-- | net/openvswitch/vport-netdev.c | 16 | ||||
| -rw-r--r-- | net/openvswitch/vport-netdev.h | 1 | ||||
| -rw-r--r-- | net/openvswitch/vport-vxlan.c | 10 | ||||
| -rw-r--r-- | net/openvswitch/vport.c | 20 | ||||
| -rw-r--r-- | net/openvswitch/vport.h | 9 |
18 files changed, 3277 insertions, 2745 deletions
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index ea36e99089a..3591cb5dae9 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -9,6 +9,8 @@ openvswitch-y := \ datapath.o \ dp_notify.o \ flow.o \ + flow_netlink.o \ + flow_table.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 65cfaa81607..e70d8b18e96 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -134,8 +134,8 @@ static int set_eth_addr(struct sk_buff *skb, skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); - memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN); - memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src); + ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst); ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); @@ -165,7 +165,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, } csum_replace4(&nh->check, *addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); *addr = new_addr; } @@ -199,7 +199,7 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, if (recalculate_csum) update_ipv6_checksum(skb, l4_proto, addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); memcpy(addr, new_addr, sizeof(__be32[4])); } @@ -296,7 +296,7 @@ static void set_tp_port(struct sk_buff *skb, __be16 *port, { inet_proto_csum_replace2(check, skb, *port, new_port, 0); *port = new_port; - skb->rxhash = 0; + skb_clear_hash(skb); } static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) @@ -310,7 +310,7 @@ static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) uh->check = CSUM_MANGLED_0; } else { *port = new_port; - skb->rxhash = 0; + skb_clear_hash(skb); } } @@ -381,7 +381,7 @@ static int set_sctp(struct sk_buff *skb, /* Carry any checksum errors through. */ sh->checksum = old_csum ^ old_correct_csum ^ new_csum; - skb->rxhash = 0; + skb_clear_hash(skb); } return 0; @@ -445,7 +445,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { switch (nla_type(a)) { case OVS_SAMPLE_ATTR_PROBABILITY: - if (net_random() >= nla_get_u32(a)) + if (prandom_u32() >= nla_get_u32(a)) return 0; break; @@ -551,6 +551,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); + if (unlikely(err)) /* skb already freed. */ + return err; break; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 2aa13bd7f2b..9db4bf6740d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -44,32 +44,54 @@ #include <linux/netfilter_ipv4.h> #include <linux/inetdevice.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> -#include <linux/workqueue.h> +#include <linux/genetlink.h> +#include <net/genetlink.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "datapath.h" #include "flow.h" +#include "flow_table.h" +#include "flow_netlink.h" #include "vport-internal_dev.h" #include "vport-netdev.h" +int ovs_net_id __read_mostly; -#define REHASH_FLOW_INTERVAL (10 * 60 * HZ) -static void rehash_flow_table(struct work_struct *work); -static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table); +static struct genl_family dp_packet_genl_family; +static struct genl_family dp_flow_genl_family; +static struct genl_family dp_datapath_genl_family; -int ovs_net_id __read_mostly; +static struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP +}; + +static struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP +}; + +struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP +}; -static void ovs_notify(struct sk_buff *skb, struct genl_info *info, - struct genl_multicast_group *grp) +/* Check if need to build a reply message. + * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ +static bool ovs_must_notify(struct genl_info *info, + const struct genl_multicast_group *grp) { - genl_notify(skb, genl_info_net(info), info->snd_portid, - grp->id, info->nlhdr, GFP_KERNEL); + return info->nlhdr->nlmsg_flags & NLM_F_ECHO || + netlink_has_listeners(genl_info_net(info)->genl_sock, 0); +} + +static void ovs_notify(struct genl_family *family, + struct sk_buff *skb, struct genl_info *info) +{ + genl_notify(family, skb, genl_info_net(info), info->snd_portid, + 0, info->nlhdr, GFP_KERNEL); } /** @@ -112,10 +134,9 @@ int lockdep_ovsl_is_held(void) #endif static struct vport *new_vport(const struct vport_parms *); -static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *, +static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *); -static int queue_userspace_packet(struct net *, int dp_ifindex, - struct sk_buff *, +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *); /* Must be called with rcu_read_lock or ovs_mutex. */ @@ -137,7 +158,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -const char *ovs_dp_name(const struct datapath *dp) +static const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -165,7 +186,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -178,6 +198,7 @@ static struct hlist_head *vport_hash_bucket(const struct datapath *dp, return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; } +/* Called with ovs_mutex or RCU read lock. */ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; @@ -225,6 +246,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) struct dp_stats_percpu *stats; struct sw_flow_key key; u64 *stats_counter; + u32 n_mask_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); @@ -237,7 +259,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } /* Look up flow. */ - flow = ovs_flow_lookup(rcu_dereference(dp->table), &key); + flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -254,32 +276,22 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) OVS_CB(skb)->flow = flow; OVS_CB(skb)->pkt_key = &key; - stats_counter = &stats->n_hit; - ovs_flow_used(OVS_CB(skb)->flow, skb); + ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); ovs_execute_actions(dp, skb); + stats_counter = &stats->n_hit; out: /* Update datapath statistics. */ - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); (*stats_counter)++; - u64_stats_update_end(&stats->sync); + stats->n_mask_hit += n_mask_hit; + u64_stats_update_end(&stats->syncp); } -static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = sizeof(struct ovs_header), - .name = OVS_PACKET_FAMILY, - .version = OVS_PACKET_VERSION, - .maxattr = OVS_PACKET_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; - int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) { struct dp_stats_percpu *stats; - int dp_ifindex; int err; if (upcall_info->portid == 0) { @@ -287,16 +299,10 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, goto err; } - dp_ifindex = get_dpifindex(dp); - if (!dp_ifindex) { - err = -ENODEV; - goto err; - } - if (!skb_is_gso(skb)) - err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + err = queue_userspace_packet(dp, skb, upcall_info); else - err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + err = queue_gso_packets(dp, skb, upcall_info); if (err) goto err; @@ -305,15 +311,14 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err: stats = this_cpu_ptr(dp->stats_percpu); - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); stats->n_lost++; - u64_stats_update_end(&stats->sync); + u64_stats_update_end(&stats->syncp); return err; } -static int queue_gso_packets(struct net *net, int dp_ifindex, - struct sk_buff *skb, +static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) { unsigned short gso_type = skb_shinfo(skb)->gso_type; @@ -322,14 +327,14 @@ static int queue_gso_packets(struct net *net, int dp_ifindex, struct sk_buff *segs, *nskb; int err; - segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false); + segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); /* Queue all of the segments. */ skb = segs; do { - err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info); + err = queue_userspace_packet(dp, skb, upcall_info); if (err) break; @@ -382,11 +387,11 @@ static size_t key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } -static size_t upcall_msg_size(const struct sk_buff *skb, - const struct nlattr *userdata) +static size_t upcall_msg_size(const struct nlattr *userdata, + unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ @@ -396,15 +401,24 @@ static size_t upcall_msg_size(const struct sk_buff *skb, return size; } -static int queue_userspace_packet(struct net *net, int dp_ifindex, - struct sk_buff *skb, +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; struct sk_buff *user_skb; /* to be queued to userspace */ struct nlattr *nla; - int err; + struct genl_info info = { + .dst_sk = ovs_dp_get_net(dp)->genl_sock, + .snd_portid = upcall_info->portid, + }; + size_t len; + unsigned int hlen; + int err, dp_ifindex; + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) + return -ENODEV; if (vlan_tx_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); @@ -424,7 +438,22 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, goto out; } - user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC); + /* Complete checksum if needed */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto out; + + /* Older versions of OVS user space enforce alignment of the last + * Netlink attribute to NLA_ALIGNTO which would require extensive + * padding logic. Only perform zerocopy if padding is not required. + */ + if (dp->user_features & OVS_DP_F_UNALIGNED) + hlen = skb_zerocopy_headlen(skb); + else + hlen = skb->len; + + len = upcall_msg_size(upcall_info->userdata, hlen); + user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -435,7 +464,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb); + ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -443,418 +472,36 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); - nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); - - skb_copy_and_csum_dev(skb, nla_data(nla)); - - genlmsg_end(user_skb, upcall); - err = genlmsg_unicast(net, user_skb, upcall_info->portid); - -out: - kfree_skb(nskb); - return err; -} - -/* Called with ovs_mutex. */ -static int flush_flows(struct datapath *dp) -{ - struct flow_table *old_table; - struct flow_table *new_table; - - old_table = ovsl_dereference(dp->table); - new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS); - if (!new_table) - return -ENOMEM; - - rcu_assign_pointer(dp->table, new_table); - - ovs_flow_tbl_destroy(old_table, true); - return 0; -} - -static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len) -{ - - struct sw_flow_actions *acts; - int new_acts_size; - int req_size = NLA_ALIGN(attr_len); - int next_offset = offsetof(struct sw_flow_actions, actions) + - (*sfa)->actions_len; - - if (req_size <= (ksize(*sfa) - next_offset)) + /* Only reserve room for attribute header, packet data is added + * in skb_zerocopy() */ + if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { + err = -ENOBUFS; goto out; - - new_acts_size = ksize(*sfa) * 2; - - if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) - return ERR_PTR(-EMSGSIZE); - new_acts_size = MAX_ACTIONS_BUFSIZE; - } - - acts = ovs_flow_actions_alloc(new_acts_size); - if (IS_ERR(acts)) - return (void *)acts; - - memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); - acts->actions_len = (*sfa)->actions_len; - kfree(*sfa); - *sfa = acts; - -out: - (*sfa)->actions_len += req_size; - return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); -} - -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) -{ - struct nlattr *a; - - a = reserve_sfa_size(sfa, nla_attr_size(len)); - if (IS_ERR(a)) - return PTR_ERR(a); - - a->nla_type = attrtype; - a->nla_len = nla_attr_size(len); - - if (data) - memcpy(nla_data(a), data, len); - memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); - - return 0; -} - -static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype) -{ - int used = (*sfa)->actions_len; - int err; - - err = add_action(sfa, attrtype, NULL, 0); - if (err) - return err; - - return used; -} - -static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset) -{ - struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset); - - a->nla_len = sfa->actions_len - st_offset; -} - -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa); - -static int validate_and_copy_sample(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) -{ - const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; - const struct nlattr *probability, *actions; - const struct nlattr *a; - int rem, start, err, st_acts; - - memset(attrs, 0, sizeof(attrs)); - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) - return -EINVAL; - attrs[type] = a; } - if (rem) - return -EINVAL; + nla->nla_len = nla_attr_size(skb->len); - probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; - if (!probability || nla_len(probability) != sizeof(u32)) - return -EINVAL; - - actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; - if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) - return -EINVAL; - - /* validation done, copy sample action. */ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); - if (start < 0) - return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32)); - if (err) - return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); - if (st_acts < 0) - return st_acts; - - err = validate_and_copy_actions(actions, key, depth + 1, sfa); + err = skb_zerocopy(user_skb, skb, skb->len, hlen); if (err) - return err; - - add_nested_action_end(*sfa, st_acts); - add_nested_action_end(*sfa, start); + goto out; - return 0; -} + /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; -static int validate_tp_port(const struct sw_flow_key *flow_key) -{ - if (flow_key->eth.type == htons(ETH_P_IP)) { - if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) - return 0; - } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { - if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) - return 0; + if (plen > 0) + memset(skb_put(user_skb, plen), 0, plen); } - return -EINVAL; -} - -static int validate_and_copy_set_tun(const struct nlattr *attr, - struct sw_flow_actions **sfa) -{ - struct sw_flow_match match; - struct sw_flow_key key; - int err, start; + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; - ovs_match_init(&match, &key, NULL); - err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false); + err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); +out: if (err) - return err; - - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); - if (start < 0) - return start; - - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); - add_nested_action_end(*sfa, start); - + skb_tx_error(skb); + kfree_skb(nskb); return err; } -static int validate_set(const struct nlattr *a, - const struct sw_flow_key *flow_key, - struct sw_flow_actions **sfa, - bool *set_tun) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - - /* There can be only one key in a action */ - if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) - return -EINVAL; - - if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type] != nla_len(ovs_key) && - ovs_key_lens[key_type] != -1)) - return -EINVAL; - - switch (key_type) { - const struct ovs_key_ipv4 *ipv4_key; - const struct ovs_key_ipv6 *ipv6_key; - int err; - - case OVS_KEY_ATTR_PRIORITY: - case OVS_KEY_ATTR_SKB_MARK: - case OVS_KEY_ATTR_ETHERNET: - break; - - case OVS_KEY_ATTR_TUNNEL: - *set_tun = true; - err = validate_and_copy_set_tun(a, sfa); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) - return -EINVAL; - - if (!flow_key->ip.proto) - return -EINVAL; - - ipv4_key = nla_data(ovs_key); - if (ipv4_key->ipv4_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv4_key->ipv4_frag != flow_key->ip.frag) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) - return -EINVAL; - - if (!flow_key->ip.proto) - return -EINVAL; - - ipv6_key = nla_data(ovs_key); - if (ipv6_key->ipv6_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv6_key->ipv6_frag != flow_key->ip.frag) - return -EINVAL; - - if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_TCP: - if (flow_key->ip.proto != IPPROTO_TCP) - return -EINVAL; - - return validate_tp_port(flow_key); - - case OVS_KEY_ATTR_UDP: - if (flow_key->ip.proto != IPPROTO_UDP) - return -EINVAL; - - return validate_tp_port(flow_key); - - case OVS_KEY_ATTR_SCTP: - if (flow_key->ip.proto != IPPROTO_SCTP) - return -EINVAL; - - return validate_tp_port(flow_key); - - default: - return -EINVAL; - } - - return 0; -} - -static int validate_userspace(const struct nlattr *attr) -{ - static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { - [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, - [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, - }; - struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; - int error; - - error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, - attr, userspace_policy); - if (error) - return error; - - if (!a[OVS_USERSPACE_ATTR_PID] || - !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) - return -EINVAL; - - return 0; -} - -static int copy_action(const struct nlattr *from, - struct sw_flow_actions **sfa) -{ - int totlen = NLA_ALIGN(from->nla_len); - struct nlattr *to; - - to = reserve_sfa_size(sfa, from->nla_len); - if (IS_ERR(to)) - return PTR_ERR(to); - - memcpy(to, from, totlen); - return 0; -} - -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) -{ - const struct nlattr *a; - int rem, err; - - if (depth >= SAMPLE_ACTION_DEPTH) - return -EOVERFLOW; - - nla_for_each_nested(a, attr, rem) { - /* Expected argument lengths, (u32)-1 for variable length. */ - static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { - [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), - [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, - [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), - [OVS_ACTION_ATTR_POP_VLAN] = 0, - [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 - }; - const struct ovs_action_push_vlan *vlan; - int type = nla_type(a); - bool skip_copy; - - if (type > OVS_ACTION_ATTR_MAX || - (action_lens[type] != nla_len(a) && - action_lens[type] != (u32)-1)) - return -EINVAL; - - skip_copy = false; - switch (type) { - case OVS_ACTION_ATTR_UNSPEC: - return -EINVAL; - - case OVS_ACTION_ATTR_USERSPACE: - err = validate_userspace(a); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_OUTPUT: - if (nla_get_u32(a) >= DP_MAX_PORTS) - return -EINVAL; - break; - - - case OVS_ACTION_ATTR_POP_VLAN: - break; - - case OVS_ACTION_ATTR_PUSH_VLAN: - vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) - return -EINVAL; - if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) - return -EINVAL; - break; - - case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); - if (err) - return err; - skip_copy = true; - break; - - default: - return -EINVAL; - } - if (!skip_copy) { - err = copy_action(a, sfa); - if (err) - return err; - } - } - - if (rem > 0) - return -EINVAL; - - return 0; -} - -static void clear_stats(struct sw_flow *flow) -{ - flow->used = 0; - flow->tcp_flags = 0; - flow->packet_count = 0; - flow->byte_count = 0; -} - static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; @@ -902,15 +549,16 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); + err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; - acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); + acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) goto err_flow_free; - err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); + err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + &flow->key, 0, &acts); rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; @@ -950,7 +598,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, }; -static struct genl_ops dp_packet_genl_ops[] = { +static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = packet_policy, @@ -958,15 +606,30 @@ static struct genl_ops dp_packet_genl_ops[] = { } }; -static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) +static struct genl_family dp_packet_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_packet_genl_ops, + .n_ops = ARRAY_SIZE(dp_packet_genl_ops), +}; + +static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, + struct ovs_dp_megaflow_stats *mega_stats) { - struct flow_table *table; int i; - table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held()); - stats->n_flows = ovs_flow_tbl_count(table); + memset(mega_stats, 0, sizeof(*mega_stats)); + + stats->n_flows = ovs_flow_tbl_count(&dp->table); + mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; + for_each_possible_cpu(i) { const struct dp_stats_percpu *percpu_stats; struct dp_stats_percpu local_stats; @@ -975,130 +638,17 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; + mega_stats->n_mask_hit += local_stats.n_mask_hit; } } -static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { - [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, -}; - -static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = sizeof(struct ovs_header), - .name = OVS_FLOW_FAMILY, - .version = OVS_FLOW_VERSION, - .maxattr = OVS_FLOW_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; - -static struct genl_multicast_group ovs_dp_flow_multicast_group = { - .name = OVS_FLOW_MCGROUP -}; - -static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb); -static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) -{ - const struct nlattr *a; - struct nlattr *start; - int err = 0, rem; - - start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); - if (!start) - return -EMSGSIZE; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - struct nlattr *st_sample; - - switch (type) { - case OVS_SAMPLE_ATTR_PROBABILITY: - if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a))) - return -EMSGSIZE; - break; - case OVS_SAMPLE_ATTR_ACTIONS: - st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); - if (!st_sample) - return -EMSGSIZE; - err = actions_to_attr(nla_data(a), nla_len(a), skb); - if (err) - return err; - nla_nest_end(skb, st_sample); - break; - } - } - - nla_nest_end(skb, start); - return err; -} - -static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - struct nlattr *start; - int err; - - switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: - start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); - if (!start) - return -EMSGSIZE; - - err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); - if (err) - return err; - nla_nest_end(skb, start); - break; - default: - if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) - return -EMSGSIZE; - break; - } - - return 0; -} - -static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb) -{ - const struct nlattr *a; - int rem, err; - - nla_for_each_attr(a, attr, len, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_ACTION_ATTR_SET: - err = set_action_to_attr(a, skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: - err = sample_action_to_attr(a, skb); - if (err) - return err; - break; - default: - if (nla_put(skb, type, nla_len(a), nla_data(a))) - return -EMSGSIZE; - break; - } - } - - return 0; -} - static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) @@ -1110,33 +660,32 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ } -/* Called with ovs_mutex. */ -static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { const int skb_orig_len = skb->len; struct nlattr *start; struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; struct ovs_header *ovs_header; struct nlattr *nla; - unsigned long used; - u8 tcp_flags; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; - ovs_header->dp_ifindex = get_dpifindex(dp); + ovs_header->dp_ifindex = dp_ifindex; /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->unmasked_key, - &flow->unmasked_key, skb); + err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); if (err) goto error; nla_nest_end(skb, nla); @@ -1145,30 +694,24 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb); + err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); if (err) goto error; nla_nest_end(skb, nla); - spin_lock_bh(&flow->lock); - used = flow->used; - stats.n_packets = flow->packet_count; - stats.n_bytes = flow->byte_count; - tcp_flags = flow->tcp_flags; - spin_unlock_bh(&flow->lock); + ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) goto nla_put_failure; if (stats.n_packets && - nla_put(skb, OVS_FLOW_ATTR_STATS, - sizeof(struct ovs_flow_stats), &stats)) + nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) goto nla_put_failure; - if (tcp_flags && - nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags)) + if ((u8)ntohs(tcp_flags) && + nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) goto nla_put_failure; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if @@ -1185,10 +728,10 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, if (start) { const struct sw_flow_actions *sf_acts; - sf_acts = rcu_dereference_check(flow->sf_acts, - lockdep_ovsl_is_held()); + sf_acts = rcu_dereference_ovsl(flow->sf_acts); + err = ovs_nla_put_actions(sf_acts->actions, + sf_acts->actions_len, skb); - err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb); if (!err) nla_nest_end(skb, start); else { @@ -1209,138 +752,128 @@ error: return err; } -static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow) +/* May not be called with RCU read lock. */ +static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, + struct genl_info *info, + bool always) { - const struct sw_flow_actions *sf_acts; + struct sk_buff *skb; - sf_acts = ovsl_dereference(flow->sf_acts); + if (!always && !ovs_must_notify(info, &ovs_dp_flow_multicast_group)) + return NULL; - return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL); + skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + return skb; } -static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, - struct datapath *dp, - u32 portid, u32 seq, u8 cmd) +/* Called with ovs_mutex. */ +static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, + int dp_ifindex, + struct genl_info *info, u8 cmd, + bool always) { struct sk_buff *skb; int retval; - skb = ovs_flow_cmd_alloc_info(flow); - if (!skb) - return ERR_PTR(-ENOMEM); + skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info, + always); + if (!skb || IS_ERR(skb)) + return skb; - retval = ovs_flow_cmd_fill_info(flow, dp, skb, portid, seq, 0, cmd); + retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, + info->snd_portid, info->snd_seq, 0, + cmd); BUG_ON(retval < 0); return skb; } -static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) +static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key, masked_key; - struct sw_flow *flow = NULL; + struct sw_flow *flow, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct flow_table *table; - struct sw_flow_actions *acts = NULL; + struct sw_flow_actions *acts; struct sw_flow_match match; int error; - /* Extract key. */ + /* Must have key and actions. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) goto error; + if (!a[OVS_FLOW_ATTR_ACTIONS]) + goto error; - ovs_match_init(&match, &key, &mask); - error = ovs_match_from_nlattrs(&match, - a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); - if (error) + /* Most of the time we need to allocate a new flow, do it before + * locking. + */ + new_flow = ovs_flow_alloc(); + if (IS_ERR(new_flow)) { + error = PTR_ERR(new_flow); goto error; + } + + /* Extract key. */ + ovs_match_init(&match, &new_flow->unmasked_key, &mask); + error = ovs_nla_get_match(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + if (error) + goto err_kfree_flow; + + ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); /* Validate actions. */ - if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) - goto error; + acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); + error = PTR_ERR(acts); + if (IS_ERR(acts)) + goto err_kfree_flow; - ovs_flow_key_mask(&masked_key, &key, &mask); - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], - &masked_key, 0, &acts); - if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree; - } - } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { - error = -EINVAL; - goto error; + error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, + 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); + goto err_kfree_acts; + } + + reply = ovs_flow_cmd_alloc_info(acts, info, false); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - error = -ENODEV; - if (!dp) + if (unlikely(!dp)) { + error = -ENODEV; goto err_unlock_ovs; - - table = ovsl_dereference(dp->table); - + } /* Check if this is a duplicate flow */ - flow = ovs_flow_lookup(table, &key); - if (!flow) { - struct sw_flow_mask *mask_p; - /* Bail out if we're not allowed to create a new flow. */ - error = -ENOENT; - if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) - goto err_unlock_ovs; - - /* Expand table, if necessary, to make room. */ - if (ovs_flow_tbl_need_to_expand(table)) { - struct flow_table *new_table; + flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key); + if (likely(!flow)) { + rcu_assign_pointer(new_flow->sf_acts, acts); - new_table = ovs_flow_tbl_expand(table); - if (!IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_destroy(table, true); - table = ovsl_dereference(dp->table); - } - } - - /* Allocate flow. */ - flow = ovs_flow_alloc(); - if (IS_ERR(flow)) { - error = PTR_ERR(flow); + /* Put flow in bucket. */ + error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); + if (unlikely(error)) { + acts = NULL; goto err_unlock_ovs; } - clear_stats(flow); - - flow->key = masked_key; - flow->unmasked_key = key; - - /* Make sure mask is unique in the system */ - mask_p = ovs_sw_flow_mask_find(table, &mask); - if (!mask_p) { - /* Allocate a new mask if none exsits. */ - mask_p = ovs_sw_flow_mask_alloc(); - if (!mask_p) - goto err_flow_free; - mask_p->key = mask.key; - mask_p->range = mask.range; - ovs_sw_flow_mask_insert(table, mask_p); - } - ovs_sw_flow_mask_add_ref(mask_p); - flow->mask = mask_p; - rcu_assign_pointer(flow->sf_acts, acts); - - /* Put flow in bucket. */ - ovs_flow_insert(table, flow); - - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(new_flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW); + BUG_ON(error < 0); + } + ovs_unlock(); } else { - /* We found a matching flow. */ struct sw_flow_actions *old_acts; /* Bail out if we're not allowed to modify an existing flow. @@ -1349,47 +882,153 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) * request. We also accept NLM_F_EXCL in case that bug ever * gets fixed. */ - error = -EEXIST; - if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW && - info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) + if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE + | NLM_F_EXCL))) { + error = -EEXIST; goto err_unlock_ovs; - + } /* The unmasked key has to be the same for flow updates. */ - error = -EINVAL; - if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) { - OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); - goto err_unlock_ovs; + if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } } - /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); - ovs_flow_deferred_free_acts(old_acts); - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW); + BUG_ON(error < 0); + } + ovs_unlock(); + + ovs_nla_free_flow_actions(old_acts); + ovs_flow_free(new_flow, false); + } + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + return 0; + +err_unlock_ovs: + ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + kfree(acts); +err_kfree_flow: + ovs_flow_free(new_flow, false); +error: + return error; +} + +static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key, masked_key; + struct sw_flow *flow; + struct sw_flow_mask mask; + struct sk_buff *reply = NULL; + struct datapath *dp; + struct sw_flow_actions *old_acts = NULL, *acts = NULL; + struct sw_flow_match match; + int error; + + /* Extract key. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) + goto error; + + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + if (error) + goto error; + + /* Validate actions. */ + if (a[OVS_FLOW_ATTR_ACTIONS]) { + acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); + error = PTR_ERR(acts); + if (IS_ERR(acts)) + goto error; + + ovs_flow_mask_key(&masked_key, &key, &mask); + error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], + &masked_key, 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); + goto err_kfree_acts; + } + } + + /* Can allocate before locking if have acts. */ + if (acts) { + reply = ovs_flow_cmd_alloc_info(acts, info, false); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + /* Check that the flow exists. */ + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { + error = -ENOENT; + goto err_unlock_ovs; + } + + /* Update actions, if present. */ + if (likely(acts)) { + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); - /* Clear stats. */ - if (a[OVS_FLOW_ATTR_CLEAR]) { - spin_lock_bh(&flow->lock); - clear_stats(flow); - spin_unlock_bh(&flow->lock); + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW); + BUG_ON(error < 0); + } + } else { + /* Could not alloc without acts before locking. */ + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, + info, OVS_FLOW_CMD_NEW, false); + if (unlikely(IS_ERR(reply))) { + error = PTR_ERR(reply); + goto err_unlock_ovs; } } + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) + ovs_flow_stats_clear(flow); ovs_unlock(); - if (!IS_ERR(reply)) - ovs_notify(reply, info, &ovs_dp_flow_multicast_group); - else - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, - ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + if (old_acts) + ovs_nla_free_flow_actions(old_acts); + return 0; -err_flow_free: - ovs_flow_free(flow, false); err_unlock_ovs: ovs_unlock(); -err_kfree: + kfree_skb(reply); +err_kfree_acts: kfree(acts); error: return error; @@ -1403,7 +1042,6 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; - struct flow_table *table; struct sw_flow_match match; int err; @@ -1413,7 +1051,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } ovs_match_init(&match, &key, NULL); - err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) return err; @@ -1424,15 +1062,14 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - table = ovsl_dereference(dp->table); - flow = ovs_flow_lookup_unmasked_key(table, &match); + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { err = -ENOENT; goto unlock; } - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, + OVS_FLOW_CMD_NEW, true); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; @@ -1453,50 +1090,56 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; - struct flow_table *table; struct sw_flow_match match; int err; + if (likely(a[OVS_FLOW_ATTR_KEY])) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); + if (unlikely(err)) + return err; + } + ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { + if (unlikely(!dp)) { err = -ENODEV; goto unlock; } - if (!a[OVS_FLOW_ATTR_KEY]) { - err = flush_flows(dp); + if (unlikely(!a[OVS_FLOW_ATTR_KEY])) { + err = ovs_flow_tbl_flush(&dp->table); goto unlock; } - ovs_match_init(&match, &key, NULL); - err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); - if (err) - goto unlock; - - table = ovsl_dereference(dp->table); - flow = ovs_flow_lookup_unmasked_key(table, &match); - if (!flow) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { err = -ENOENT; goto unlock; } - reply = ovs_flow_cmd_alloc_info(flow); - if (!reply) { - err = -ENOMEM; - goto unlock; - } - - ovs_flow_remove(table, flow); + ovs_flow_tbl_remove(&dp->table, flow); + ovs_unlock(); - err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_FLOW_CMD_DEL); - BUG_ON(err < 0); + reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, + info, false); + if (likely(reply)) { + if (likely(!IS_ERR(reply))) { + rcu_read_lock(); /*To keep RCU checker happy. */ + err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_DEL); + rcu_read_unlock(); + BUG_ON(err < 0); + + ovs_notify(&dp_flow_genl_family, reply, info); + } else { + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + } + } ovs_flow_free(flow, true); - ovs_unlock(); - - ovs_notify(reply, info, &ovs_dp_flow_multicast_group); return 0; unlock: ovs_unlock(); @@ -1506,8 +1149,8 @@ unlock: static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct table_instance *ti; struct datapath *dp; - struct flow_table *table; rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -1516,18 +1159,18 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return -ENODEV; } - table = rcu_dereference(dp->table); + ti = rcu_dereference(dp->table.ti); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_dump_next(table, &bucket, &obj); + flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); if (!flow) break; - if (ovs_flow_cmd_fill_info(flow, dp, skb, + if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_FLOW_CMD_NEW) < 0) @@ -1540,11 +1183,17 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, +}; + static struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, - .doit = ovs_flow_cmd_new_or_set + .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -1560,27 +1209,22 @@ static struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_SET, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, - .doit = ovs_flow_cmd_new_or_set, + .doit = ovs_flow_cmd_set, }, }; -static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { - [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, -}; - -static struct genl_family dp_datapath_genl_family = { +static struct genl_family dp_flow_genl_family = { .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), - .name = OVS_DATAPATH_FAMILY, - .version = OVS_DATAPATH_VERSION, - .maxattr = OVS_DP_ATTR_MAX, + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX, .netnsok = true, .parallel_ops = true, -}; - -static struct genl_multicast_group ovs_dp_datapath_multicast_group = { - .name = OVS_DATAPATH_MCGROUP + .ops = dp_flow_genl_ops, + .n_ops = ARRAY_SIZE(dp_flow_genl_ops), + .mcgrps = &ovs_dp_flow_multicast_group, + .n_mcgrps = 1, }; static size_t ovs_dp_cmd_msg_size(void) @@ -1589,15 +1233,19 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ return msgsize; } +/* Called with ovs_mutex or RCU read lock. */ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; + struct ovs_dp_megaflow_stats dp_megaflow_stats; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, @@ -1607,14 +1255,21 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, ovs_header->dp_ifindex = get_dpifindex(dp); - rcu_read_lock(); err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); - rcu_read_unlock(); if (err) goto nla_put_failure; - get_dp_stats(dp, &dp_stats); - if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats)) + get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); + if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats)) + goto nla_put_failure; + + if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; return genlmsg_end(skb, ovs_header); @@ -1625,25 +1280,12 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid, - u32 seq, u8 cmd) +static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) { - struct sk_buff *skb; - int retval; - - skb = genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - retval = ovs_dp_cmd_fill_info(dp, skb, portid, seq, 0, cmd); - if (retval < 0) { - kfree_skb(skb); - return ERR_PTR(retval); - } - return skb; + return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); } -/* Called with ovs_mutex. */ +/* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) @@ -1655,14 +1297,30 @@ static struct datapath *lookup_datapath(struct net *net, else { struct vport *vport; - rcu_read_lock(); vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; - rcu_read_unlock(); } return dp ? dp : ERR_PTR(-ENODEV); } +static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +{ + struct datapath *dp; + + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) + return; + + WARN(dp->user_features, "Dropping previously announced user features\n"); + dp->user_features = 0; +} + +static void ovs_dp_change(struct datapath *dp, struct nlattr **a) +{ + if (a[OVS_DP_ATTR_USER_FEATURES]) + dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); +} + static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1677,29 +1335,30 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - ovs_lock(); + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_unlock_ovs; + goto err_free_reply; ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); /* Allocate table. */ - err = -ENOMEM; - rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS)); - if (!dp->table) + err = ovs_flow_tbl_init(&dp->table); + if (err) goto err_free_dp; - dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); if (!dp->stats_percpu) { err = -ENOMEM; goto err_destroy_table; } dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), - GFP_KERNEL); + GFP_KERNEL); if (!dp->ports) { err = -ENOMEM; goto err_destroy_percpu; @@ -1716,42 +1375,53 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.port_no = OVSP_LOCAL; parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + ovs_dp_change(dp, a); + + /* So far only local changes have been made, now need the lock. */ + ovs_lock(); + vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); if (err == -EBUSY) err = -EEXIST; + if (err == -EEXIST) { + /* An outdated user space instance that does not understand + * the concept of user_features has attempted to create a new + * datapath and is likely to reuse it. Drop all user features. + */ + if (info->genlhdr->version < OVS_DP_VER_FEATURES) + ovs_dp_reset_user_features(skb, info); + } + goto err_destroy_ports_array; } - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto err_destroy_local_port; + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); list_add_tail_rcu(&dp->list_node, &ovs_net->dps); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_local_port: - ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); err_destroy_ports_array: + ovs_unlock(); kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false); + ovs_flow_tbl_destroy(&dp->table, false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); -err_unlock_ovs: - ovs_unlock(); +err_free_reply: + kfree_skb(reply); err: return err; } @@ -1773,10 +1443,13 @@ static void __dp_destroy(struct datapath *dp) list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that - * all port in datapath are destroyed first before freeing datapath. + * all ports in datapath are destroyed first before freeing datapath. */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + /* RCU destroy the flow table */ + ovs_flow_tbl_destroy(&dp->table, true); + call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1786,26 +1459,30 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) - goto unlock; + goto err_unlock_free; - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_DEL); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto unlock; + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_DEL); + BUG_ON(err < 0); __dp_destroy(dp); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -unlock: + +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1815,28 +1492,30 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) - goto unlock; + goto err_unlock_free; - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, - ovs_dp_datapath_multicast_group.id, err); - err = 0; - goto unlock; - } + ovs_dp_change(dp, info->attrs); + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -unlock: + +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1846,25 +1525,26 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - ovs_lock(); + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + rcu_read_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); - goto unlock; - } - - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - goto unlock; + goto err_unlock_free; } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + rcu_read_unlock(); - ovs_unlock(); return genlmsg_reply(reply, info); -unlock: - ovs_unlock(); +err_unlock_free: + rcu_read_unlock(); + kfree_skb(reply); return err; } @@ -1891,6 +1571,12 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, +}; + static struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -1915,27 +1601,18 @@ static struct genl_ops dp_datapath_genl_ops[] = { }, }; -static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { - [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, - [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, -}; - -static struct genl_family dp_vport_genl_family = { +static struct genl_family dp_datapath_genl_family = { .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), - .name = OVS_VPORT_FAMILY, - .version = OVS_VPORT_VERSION, - .maxattr = OVS_VPORT_ATTR_MAX, + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX, .netnsok = true, .parallel_ops = true, -}; - -struct genl_multicast_group ovs_dp_vport_multicast_group = { - .name = OVS_VPORT_MCGROUP + .ops = dp_datapath_genl_ops, + .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .mcgrps = &ovs_dp_datapath_multicast_group, + .n_mcgrps = 1, }; /* Called with ovs_mutex or RCU read lock. */ @@ -1977,7 +1654,12 @@ error: return err; } -/* Called with ovs_mutex or RCU read lock. */ +static struct sk_buff *ovs_vport_cmd_alloc_info(void) +{ + return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +} + +/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, u32 seq, u8 cmd) { @@ -2039,33 +1721,35 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) u32 port_no; int err; - err = -EINVAL; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) - goto exit; + return -EINVAL; + + port_no = a[OVS_VPORT_ATTR_PORT_NO] + ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; + if (port_no >= DP_MAX_PORTS) + return -EFBIG; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) - goto exit_unlock; - - if (a[OVS_VPORT_ATTR_PORT_NO]) { - port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); - - err = -EFBIG; - if (port_no >= DP_MAX_PORTS) - goto exit_unlock; + goto exit_unlock_free; + if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) - goto exit_unlock; + goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; - goto exit_unlock; + goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) @@ -2083,22 +1767,19 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) vport = new_vport(&parms); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; - err = 0; - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, - OVS_VPORT_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - ovs_dp_detach_port(vport); - goto exit_unlock; - } + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; -exit_unlock: +exit_unlock_free: ovs_unlock(); -exit: + kfree_skb(reply); return err; } @@ -2109,28 +1790,26 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; if (a[OVS_VPORT_ATTR_TYPE] && nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { err = -EINVAL; - goto exit_unlock; - } - - reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!reply) { - err = -ENOMEM; - goto exit_unlock; + goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_OPTIONS]) { err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); if (err) - goto exit_free; + goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_UPCALL_PID]) @@ -2141,13 +1820,12 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) BUG_ON(err < 0); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + ovs_notify(&dp_vport_genl_family, reply, info); return 0; -exit_free: - kfree_skb(reply); -exit_unlock: +exit_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -2158,30 +1836,33 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; if (vport->port_no == OVSP_LOCAL) { err = -EINVAL; - goto exit_unlock; + goto exit_unlock_free; } - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, - info->snd_seq, OVS_VPORT_CMD_DEL); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto exit_unlock; - - err = 0; + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_DEL); + BUG_ON(err < 0); ovs_dp_detach_port(vport); + ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; -exit_unlock: +exit_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -2193,24 +1874,25 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + rcu_read_lock(); vport = lookup_vport(sock_net(skb->sk), ovs_header, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; - - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, - info->snd_seq, OVS_VPORT_CMD_NEW); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto exit_unlock; - + goto exit_unlock_free; + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); rcu_read_unlock(); return genlmsg_reply(reply, info); -exit_unlock: +exit_unlock_free: rcu_read_unlock(); + kfree_skb(reply); return err; } @@ -2221,11 +1903,12 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int bucket = cb->args[0], skip = cb->args[1]; int i, j = 0; + rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) + if (!dp) { + rcu_read_unlock(); return -ENODEV; - - rcu_read_lock(); + } for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; @@ -2252,6 +1935,15 @@ out: return skb->len; } +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + static struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -2276,26 +1968,25 @@ static struct genl_ops dp_vport_genl_ops[] = { }, }; -struct genl_family_and_ops { - struct genl_family *family; - struct genl_ops *ops; - int n_ops; - struct genl_multicast_group *group; +struct genl_family dp_vport_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_vport_genl_ops, + .n_ops = ARRAY_SIZE(dp_vport_genl_ops), + .mcgrps = &ovs_dp_vport_multicast_group, + .n_mcgrps = 1, }; -static const struct genl_family_and_ops dp_genl_families[] = { - { &dp_datapath_genl_family, - dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), - &ovs_dp_datapath_multicast_group }, - { &dp_vport_genl_family, - dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), - &ovs_dp_vport_multicast_group }, - { &dp_flow_genl_family, - dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), - &ovs_dp_flow_multicast_group }, - { &dp_packet_genl_family, - dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), - NULL }, +static struct genl_family * const dp_genl_families[] = { + &dp_datapath_genl_family, + &dp_vport_genl_family, + &dp_flow_genl_family, + &dp_packet_genl_family, }; static void dp_unregister_genl(int n_families) @@ -2303,65 +1994,28 @@ static void dp_unregister_genl(int n_families) int i; for (i = 0; i < n_families; i++) - genl_unregister_family(dp_genl_families[i].family); + genl_unregister_family(dp_genl_families[i]); } static int dp_register_genl(void) { - int n_registered; int err; int i; - n_registered = 0; for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { - const struct genl_family_and_ops *f = &dp_genl_families[i]; - err = genl_register_family_with_ops(f->family, f->ops, - f->n_ops); + err = genl_register_family(dp_genl_families[i]); if (err) goto error; - n_registered++; - - if (f->group) { - err = genl_register_mc_group(f->family, f->group); - if (err) - goto error; - } } return 0; error: - dp_unregister_genl(n_registered); + dp_unregister_genl(i); return err; } -static void rehash_flow_table(struct work_struct *work) -{ - struct datapath *dp; - struct net *net; - - ovs_lock(); - rtnl_lock(); - for_each_net(net) { - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - list_for_each_entry(dp, &ovs_net->dps, list_node) { - struct flow_table *old_table = ovsl_dereference(dp->table); - struct flow_table *new_table; - - new_table = ovs_flow_tbl_rehash(old_table); - if (!IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_destroy(old_table, true); - } - } - } - rtnl_unlock(); - ovs_unlock(); - schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); -} - static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); @@ -2419,8 +2073,6 @@ static int __init dp_init(void) if (err < 0) goto error_unreg_notifier; - schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); - return 0; error_unreg_notifier: @@ -2437,7 +2089,6 @@ error: static void dp_cleanup(void) { - cancel_delayed_work_sync(&rehash_flow_wq); dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4d109c176ef..7ede507500d 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -27,6 +27,7 @@ #include <linux/u64_stats_sync.h> #include "flow.h" +#include "flow_table.h" #include "vport.h" #define DP_MAX_PORTS USHRT_MAX @@ -45,19 +46,23 @@ * @n_lost: Number of received packets that had no matching flow in the flow * table that could not be sent to userspace (normally due to an overflow in * one of the datapath's queues). + * @n_mask_hit: Number of masks looked up for flow match. + * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked + * up per packet. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; - struct u64_stats_sync sync; + u64 n_mask_hit; + struct u64_stats_sync syncp; }; /** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. - * @table: Current flow table. Protected by ovs_mutex and RCU. + * @table: flow table. * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. @@ -71,7 +76,7 @@ struct datapath { struct list_head list_node; /* Flow table. */ - struct flow_table __rcu *table; + struct flow_table table; /* Switch ports. */ struct hlist_head *ports; @@ -83,6 +88,8 @@ struct datapath { /* Network namespace ref. */ struct net *net; #endif + + u32 user_features; }; /** @@ -140,6 +147,8 @@ int lockdep_ovsl_is_held(void); #define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held())) #define ovsl_dereference(p) \ rcu_dereference_protected(p, lockdep_ovsl_is_held()) +#define rcu_dereference_ovsl(p) \ + rcu_dereference_check(p, lockdep_ovsl_is_held()) static inline struct net *ovs_dp_get_net(struct datapath *dp) { @@ -172,21 +181,22 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n } extern struct notifier_block ovs_dp_device_notifier; -extern struct genl_multicast_group ovs_dp_vport_multicast_group; +extern struct genl_family dp_vport_genl_family; void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); -const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); -#define OVS_NLERR(fmt, ...) \ - pr_info_once("netlink: " fmt, ##__VA_ARGS__) - +#define OVS_NLERR(fmt, ...) \ +do { \ + if (net_ratelimit()) \ + pr_info("netlink: " fmt, ##__VA_ARGS__); \ +} while (0) #endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index c3235675f35..2c631fe76be 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -34,15 +34,14 @@ static void dp_detach_port_notify(struct vport *vport) OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { - netlink_set_err(ovs_dp_get_net(dp)->genl_sock, 0, - ovs_dp_vport_multicast_group.id, - PTR_ERR(notify)); + genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, + 0, PTR_ERR(notify)); return; } - genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0, - ovs_dp_vport_multicast_group.id, - GFP_KERNEL); + genlmsg_multicast_netns(&dp_vport_genl_family, + ovs_dp_get_net(dp), notify, 0, + 0, GFP_KERNEL); } void ovs_dp_notify_wq(struct work_struct *work) @@ -65,8 +64,7 @@ void ovs_dp_notify_wq(struct work_struct *work) continue; netdev_vport = netdev_vport_priv(vport); - if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED || - netdev_vport->dev->reg_state == NETREG_UNREGISTERING) + if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } @@ -88,6 +86,10 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { + /* upper_dev_unlink and decrement promisc immediately */ + ovs_netdev_detach_dev(vport); + + /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); queue_work(system_wq, &ovs_net->dp_notify_work); } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 410db90db73..d07ab538fc9 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -35,6 +35,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/sctp.h> +#include <linux/smp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> @@ -45,202 +46,128 @@ #include <net/ipv6.h> #include <net/ndisc.h> -static struct kmem_cache *flow_cache; - -static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, - struct sw_flow_key_range *range, u8 val); - -static void update_range__(struct sw_flow_match *match, - size_t offset, size_t size, bool is_mask) +u64 ovs_flow_used_time(unsigned long flow_jiffies) { - struct sw_flow_key_range *range = NULL; - size_t start = rounddown(offset, sizeof(long)); - size_t end = roundup(offset + size, sizeof(long)); - - if (!is_mask) - range = &match->range; - else if (match->mask) - range = &match->mask->range; - - if (!range) - return; - - if (range->start == range->end) { - range->start = start; - range->end = end; - return; - } + struct timespec cur_ts; + u64 cur_ms, idle_ms; - if (range->start > start) - range->start = start; + ktime_get_ts(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; - if (range->end < end) - range->end = end; + return cur_ms - idle_ms; } -#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - (match)->mask->key.field = value; \ - } else { \ - (match)->key->field = value; \ - } \ - } while (0) - -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ - } while (0) - -static u16 range_n_bytes(const struct sw_flow_key_range *range) -{ - return range->end - range->start; -} +#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) -void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, - struct sw_flow_mask *mask) +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + struct sk_buff *skb) { - memset(match, 0, sizeof(*match)); - match->key = key; - match->mask = mask; + struct flow_stats *stats; + int node = numa_node_id(); - memset(key, 0, sizeof(*key)); + stats = rcu_dereference(flow->stats[node]); - if (mask) { - memset(&mask->key, 0, sizeof(mask->key)); - mask->range.start = mask->range.end = 0; - } -} - -static bool ovs_match_validate(const struct sw_flow_match *match, - u64 key_attrs, u64 mask_attrs) -{ - u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; - u64 mask_allowed = key_attrs; /* At most allow all key attributes */ - - /* The following mask attributes allowed only if they - * pass the validation tests. */ - mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) - | (1 << OVS_KEY_ATTR_IPV6) - | (1 << OVS_KEY_ATTR_TCP) - | (1 << OVS_KEY_ATTR_UDP) - | (1 << OVS_KEY_ATTR_SCTP) - | (1 << OVS_KEY_ATTR_ICMP) - | (1 << OVS_KEY_ATTR_ICMPV6) - | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); - - /* Always allowed mask fields. */ - mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) - | (1 << OVS_KEY_ATTR_IN_PORT) - | (1 << OVS_KEY_ATTR_ETHERTYPE)); - - /* Check key attributes. */ - if (match->key->eth.type == htons(ETH_P_ARP) - || match->key->eth.type == htons(ETH_P_RARP)) { - key_expected |= 1 << OVS_KEY_ATTR_ARP; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_ARP; - } - - if (match->key->eth.type == htons(ETH_P_IP)) { - key_expected |= 1 << OVS_KEY_ATTR_IPV4; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; - - if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { - if (match->key->ip.proto == IPPROTO_UDP) { - key_expected |= 1 << OVS_KEY_ATTR_UDP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_UDP; - } - - if (match->key->ip.proto == IPPROTO_SCTP) { - key_expected |= 1 << OVS_KEY_ATTR_SCTP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; - } - - if (match->key->ip.proto == IPPROTO_TCP) { - key_expected |= 1 << OVS_KEY_ATTR_TCP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_TCP; - } + /* Check if already have node-specific stats. */ + if (likely(stats)) { + spin_lock(&stats->lock); + /* Mark if we write on the pre-allocated stats. */ + if (node == 0 && unlikely(flow->stats_last_writer != node)) + flow->stats_last_writer = node; + } else { + stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ + spin_lock(&stats->lock); - if (match->key->ip.proto == IPPROTO_ICMP) { - key_expected |= 1 << OVS_KEY_ATTR_ICMP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + /* If the current NUMA-node is the only writer on the + * pre-allocated stats keep using them. + */ + if (unlikely(flow->stats_last_writer != node)) { + /* A previous locker may have already allocated the + * stats, so we need to check again. If node-specific + * stats were already allocated, we update the pre- + * allocated stats as we have already locked them. + */ + if (likely(flow->stats_last_writer != NUMA_NO_NODE) + && likely(!rcu_dereference(flow->stats[node]))) { + /* Try to allocate node-specific stats. */ + struct flow_stats *new_stats; + + new_stats = + kmem_cache_alloc_node(flow_stats_cache, + GFP_THISNODE | + __GFP_NOMEMALLOC, + node); + if (likely(new_stats)) { + new_stats->used = jiffies; + new_stats->packet_count = 1; + new_stats->byte_count = skb->len; + new_stats->tcp_flags = tcp_flags; + spin_lock_init(&new_stats->lock); + + rcu_assign_pointer(flow->stats[node], + new_stats); + goto unlock; + } } + flow->stats_last_writer = node; } } - if (match->key->eth.type == htons(ETH_P_IPV6)) { - key_expected |= 1 << OVS_KEY_ATTR_IPV6; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + stats->used = jiffies; + stats->packet_count++; + stats->byte_count += skb->len; + stats->tcp_flags |= tcp_flags; +unlock: + spin_unlock(&stats->lock); +} - if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { - if (match->key->ip.proto == IPPROTO_UDP) { - key_expected |= 1 << OVS_KEY_ATTR_UDP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_UDP; - } +/* Must be called with rcu_read_lock or ovs_mutex. */ +void ovs_flow_stats_get(const struct sw_flow *flow, + struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + int node; - if (match->key->ip.proto == IPPROTO_SCTP) { - key_expected |= 1 << OVS_KEY_ATTR_SCTP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; - } + *used = 0; + *tcp_flags = 0; + memset(ovs_stats, 0, sizeof(*ovs_stats)); - if (match->key->ip.proto == IPPROTO_TCP) { - key_expected |= 1 << OVS_KEY_ATTR_TCP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_TCP; - } + for_each_node(node) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); - if (match->key->ip.proto == IPPROTO_ICMPV6) { - key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; - - if (match->key->ipv6.tp.src == - htons(NDISC_NEIGHBOUR_SOLICITATION) || - match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - key_expected |= 1 << OVS_KEY_ATTR_ND; - if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_ND; - } - } + if (stats) { + /* Local CPU may write on non-local stats, so we must + * block bottom-halves here. + */ + spin_lock_bh(&stats->lock); + if (!*used || time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock_bh(&stats->lock); } } +} - if ((key_attrs & key_expected) != key_expected) { - /* Key attributes check failed. */ - OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", - key_attrs, key_expected); - return false; - } +/* Called with ovs_mutex. */ +void ovs_flow_stats_clear(struct sw_flow *flow) +{ + int node; - if ((mask_attrs & mask_allowed) != mask_attrs) { - /* Mask attributes check failed. */ - OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", - mask_attrs, mask_allowed); - return false; - } + for_each_node(node) { + struct flow_stats *stats = ovsl_dereference(flow->stats[node]); - return true; + if (stats) { + spin_lock_bh(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock_bh(&stats->lock); + } + } } static int check_header(struct sk_buff *skb, int len) @@ -311,19 +238,6 @@ static bool icmphdr_ok(struct sk_buff *skb) sizeof(struct icmphdr)); } -u64 ovs_flow_used_time(unsigned long flow_jiffies) -{ - struct timespec cur_ts; - u64 cur_ms, idle_ms; - - ktime_get_ts(&cur_ts); - idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); - cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + - cur_ts.tv_nsec / NSEC_PER_MSEC; - - return cur_ms - idle_ms; -} - static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned int nh_ofs = skb_network_offset(skb); @@ -372,311 +286,6 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask) -{ - const long *m = (long *)((u8 *)&mask->key + mask->range.start); - const long *s = (long *)((u8 *)src + mask->range.start); - long *d = (long *)((u8 *)dst + mask->range.start); - int i; - - /* The memory outside of the 'mask->range' are not set since - * further operations on 'dst' only uses contents within - * 'mask->range'. - */ - for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) - *d++ = *s++ & *m++; -} - -#define TCP_FLAGS_OFFSET 13 -#define TCP_FLAG_MASK 0x3f - -void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) -{ - u8 tcp_flags = 0; - - if ((flow->key.eth.type == htons(ETH_P_IP) || - flow->key.eth.type == htons(ETH_P_IPV6)) && - flow->key.ip.proto == IPPROTO_TCP && - likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { - u8 *tcp = (u8 *)tcp_hdr(skb); - tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; - } - - spin_lock(&flow->lock); - flow->used = jiffies; - flow->packet_count++; - flow->byte_count += skb->len; - flow->tcp_flags |= tcp_flags; - spin_unlock(&flow->lock); -} - -struct sw_flow_actions *ovs_flow_actions_alloc(int size) -{ - struct sw_flow_actions *sfa; - - if (size > MAX_ACTIONS_BUFSIZE) - return ERR_PTR(-EINVAL); - - sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); - if (!sfa) - return ERR_PTR(-ENOMEM); - - sfa->actions_len = 0; - return sfa; -} - -struct sw_flow *ovs_flow_alloc(void) -{ - struct sw_flow *flow; - - flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); - if (!flow) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&flow->lock); - flow->sf_acts = NULL; - flow->mask = NULL; - - return flow; -} - -static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) -{ - hash = jhash_1word(hash, table->hash_seed); - return flex_array_get(table->buckets, - (hash & (table->n_buckets - 1))); -} - -static struct flex_array *alloc_buckets(unsigned int n_buckets) -{ - struct flex_array *buckets; - int i, err; - - buckets = flex_array_alloc(sizeof(struct hlist_head), - n_buckets, GFP_KERNEL); - if (!buckets) - return NULL; - - err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); - if (err) { - flex_array_free(buckets); - return NULL; - } - - for (i = 0; i < n_buckets; i++) - INIT_HLIST_HEAD((struct hlist_head *) - flex_array_get(buckets, i)); - - return buckets; -} - -static void free_buckets(struct flex_array *buckets) -{ - flex_array_free(buckets); -} - -static struct flow_table *__flow_tbl_alloc(int new_size) -{ - struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); - - if (!table) - return NULL; - - table->buckets = alloc_buckets(new_size); - - if (!table->buckets) { - kfree(table); - return NULL; - } - table->n_buckets = new_size; - table->count = 0; - table->node_ver = 0; - table->keep_flows = false; - get_random_bytes(&table->hash_seed, sizeof(u32)); - table->mask_list = NULL; - - return table; -} - -static void __flow_tbl_destroy(struct flow_table *table) -{ - int i; - - if (table->keep_flows) - goto skip_flows; - - for (i = 0; i < table->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = flex_array_get(table->buckets, i); - struct hlist_node *n; - int ver = table->node_ver; - - hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del(&flow->hash_node[ver]); - ovs_flow_free(flow, false); - } - } - - BUG_ON(!list_empty(table->mask_list)); - kfree(table->mask_list); - -skip_flows: - free_buckets(table->buckets); - kfree(table); -} - -struct flow_table *ovs_flow_tbl_alloc(int new_size) -{ - struct flow_table *table = __flow_tbl_alloc(new_size); - - if (!table) - return NULL; - - table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); - if (!table->mask_list) { - table->keep_flows = true; - __flow_tbl_destroy(table); - return NULL; - } - INIT_LIST_HEAD(table->mask_list); - - return table; -} - -static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) -{ - struct flow_table *table = container_of(rcu, struct flow_table, rcu); - - __flow_tbl_destroy(table); -} - -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) -{ - if (!table) - return; - - if (deferred) - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); - else - __flow_tbl_destroy(table); -} - -struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last) -{ - struct sw_flow *flow; - struct hlist_head *head; - int ver; - int i; - - ver = table->node_ver; - while (*bucket < table->n_buckets) { - i = 0; - head = flex_array_get(table->buckets, *bucket); - hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { - if (i < *last) { - i++; - continue; - } - *last = i + 1; - return flow; - } - (*bucket)++; - *last = 0; - } - - return NULL; -} - -static void __tbl_insert(struct flow_table *table, struct sw_flow *flow) -{ - struct hlist_head *head; - - head = find_bucket(table, flow->hash); - hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); - - table->count++; -} - -static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new) -{ - int old_ver; - int i; - - old_ver = old->node_ver; - new->node_ver = !old_ver; - - /* Insert in new table. */ - for (i = 0; i < old->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head; - - head = flex_array_get(old->buckets, i); - - hlist_for_each_entry(flow, head, hash_node[old_ver]) - __tbl_insert(new, flow); - } - - new->mask_list = old->mask_list; - old->keep_flows = true; -} - -static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets) -{ - struct flow_table *new_table; - - new_table = __flow_tbl_alloc(n_buckets); - if (!new_table) - return ERR_PTR(-ENOMEM); - - flow_table_copy_flows(table, new_table); - - return new_table; -} - -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets); -} - -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets * 2); -} - -static void __flow_free(struct sw_flow *flow) -{ - kfree((struct sf_flow_acts __force *)flow->sf_acts); - kmem_cache_free(flow_cache, flow); -} - -static void rcu_free_flow_callback(struct rcu_head *rcu) -{ - struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - - __flow_free(flow); -} - -void ovs_flow_free(struct sw_flow *flow, bool deferred) -{ - if (!flow) - return; - - ovs_sw_flow_mask_del_ref(flow->mask, deferred); - - if (deferred) - call_rcu(&flow->rcu, rcu_free_flow_callback); - else - __flow_free(flow); -} - -/* Schedules 'sf_acts' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts) -{ - kfree_rcu(sf_acts, rcu); -} - static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) { struct qtag_prefix { @@ -745,8 +354,8 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, /* The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. */ - key->ipv6.tp.src = htons(icmp->icmp6_type); - key->ipv6.tp.dst = htons(icmp->icmp6_code); + key->tp.src = htons(icmp->icmp6_type); + key->tp.dst = htons(icmp->icmp6_code); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -785,14 +394,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, && opt_len == 8) { if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) goto invalid; - memcpy(key->ipv6.nd.sll, - &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + ether_addr_copy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)]); } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR && opt_len == 8) { if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) goto invalid; - memcpy(key->ipv6.nd.tll, - &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + ether_addr_copy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)]); } icmp_len -= opt_len; @@ -852,8 +461,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) * header in the linear data area. */ eth = eth_hdr(skb); - memcpy(key->eth.src, eth->h_source, ETH_ALEN); - memcpy(key->eth.dst, eth->h_dest, ETH_ALEN); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); __skb_pull(skb, 2 * ETH_ALEN); /* We are going to push all headers that we pull, so no need to @@ -908,20 +517,21 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) if (key->ip.proto == IPPROTO_TCP) { if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); - key->ipv4.tp.src = tcp->source; - key->ipv4.tp.dst = tcp->dest; + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); } } else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); - key->ipv4.tp.src = udp->source; - key->ipv4.tp.dst = udp->dest; + key->tp.src = udp->source; + key->tp.dst = udp->dest; } } else if (key->ip.proto == IPPROTO_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); - key->ipv4.tp.src = sctp->source; - key->ipv4.tp.dst = sctp->dest; + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { @@ -929,8 +539,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) /* The ICMP type and code fields use the 16-bit * transport port fields, so we need to store * them in 16-bit network byte order. */ - key->ipv4.tp.src = htons(icmp->type); - key->ipv4.tp.dst = htons(icmp->code); + key->tp.src = htons(icmp->type); + key->tp.dst = htons(icmp->code); } } @@ -950,8 +560,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->ip.proto = ntohs(arp->ar_op); memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); - memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); - memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); + ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); + ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ @@ -976,20 +586,21 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); - key->ipv6.tp.src = tcp->source; - key->ipv6.tp.dst = tcp->dest; + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); } } else if (key->ip.proto == NEXTHDR_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); - key->ipv6.tp.src = udp->source; - key->ipv6.tp.dst = udp->dest; + key->tp.src = udp->source; + key->tp.dst = udp->dest; } } else if (key->ip.proto == NEXTHDR_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); - key->ipv6.tp.src = sctp->source; - key->ipv6.tp.dst = sctp->dest; + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { @@ -1002,1080 +613,3 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) return 0; } - -static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, - int key_end) -{ - u32 *hash_key = (u32 *)((u8 *)key + key_start); - int hash_u32s = (key_end - key_start) >> 2; - - /* Make sure number of hash bytes are multiple of u32. */ - BUILD_BUG_ON(sizeof(long) % sizeof(u32)); - - return jhash2(hash_key, hash_u32s, 0); -} - -static int flow_key_start(const struct sw_flow_key *key) -{ - if (key->tun_key.ipv4_dst) - return 0; - else - return rounddown(offsetof(struct sw_flow_key, phy), - sizeof(long)); -} - -static bool __cmp_key(const struct sw_flow_key *key1, - const struct sw_flow_key *key2, int key_start, int key_end) -{ - const long *cp1 = (long *)((u8 *)key1 + key_start); - const long *cp2 = (long *)((u8 *)key2 + key_start); - long diffs = 0; - int i; - - for (i = key_start; i < key_end; i += sizeof(long)) - diffs |= *cp1++ ^ *cp2++; - - return diffs == 0; -} - -static bool __flow_cmp_masked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_start, int key_end) -{ - return __cmp_key(&flow->key, key, key_start, key_end); -} - -static bool __flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_start, int key_end) -{ - return __cmp_key(&flow->unmasked_key, key, key_start, key_end); -} - -bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_end) -{ - int key_start; - key_start = flow_key_start(key); - - return __flow_cmp_unmasked_key(flow, key, key_start, key_end); - -} - -struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, - struct sw_flow_match *match) -{ - struct sw_flow_key *unmasked = match->key; - int key_end = match->range.end; - struct sw_flow *flow; - - flow = ovs_flow_lookup(table, unmasked); - if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_end))) - flow = NULL; - - return flow; -} - -static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table, - const struct sw_flow_key *unmasked, - struct sw_flow_mask *mask) -{ - struct sw_flow *flow; - struct hlist_head *head; - int key_start = mask->range.start; - int key_end = mask->range.end; - u32 hash; - struct sw_flow_key masked_key; - - ovs_flow_key_mask(&masked_key, unmasked, mask); - hash = ovs_flow_hash(&masked_key, key_start, key_end); - head = find_bucket(table, hash); - hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { - if (flow->mask == mask && - __flow_cmp_masked_key(flow, &masked_key, - key_start, key_end)) - return flow; - } - return NULL; -} - -struct sw_flow *ovs_flow_lookup(struct flow_table *tbl, - const struct sw_flow_key *key) -{ - struct sw_flow *flow = NULL; - struct sw_flow_mask *mask; - - list_for_each_entry_rcu(mask, tbl->mask_list, list) { - flow = ovs_masked_flow_lookup(tbl, key, mask); - if (flow) /* Found */ - break; - } - - return flow; -} - - -void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow) -{ - flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start, - flow->mask->range.end); - __tbl_insert(table, flow); -} - -void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow) -{ - BUG_ON(table->count == 0); - hlist_del_rcu(&flow->hash_node[table->node_ver]); - table->count--; -} - -/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ -const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { - [OVS_KEY_ATTR_ENCAP] = -1, - [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), - [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), - [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), - [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), - [OVS_KEY_ATTR_VLAN] = sizeof(__be16), - [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), - [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), - [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), - [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), - [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), - [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), - [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), - [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), - [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), - [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), - [OVS_KEY_ATTR_TUNNEL] = -1, -}; - -static bool is_all_zero(const u8 *fp, size_t size) -{ - int i; - - if (!fp) - return false; - - for (i = 0; i < size; i++) - if (fp[i]) - return false; - - return true; -} - -static int __parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], - u64 *attrsp, bool nz) -{ - const struct nlattr *nla; - u32 attrs; - int rem; - - attrs = *attrsp; - nla_for_each_nested(nla, attr, rem) { - u16 type = nla_type(nla); - int expected_len; - - if (type > OVS_KEY_ATTR_MAX) { - OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", - type, OVS_KEY_ATTR_MAX); - return -EINVAL; - } - - if (attrs & (1 << type)) { - OVS_NLERR("Duplicate key attribute (type %d).\n", type); - return -EINVAL; - } - - expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) { - OVS_NLERR("Key attribute has unexpected length (type=%d" - ", length=%d, expected=%d).\n", type, - nla_len(nla), expected_len); - return -EINVAL; - } - - if (!nz || !is_all_zero(nla_data(nla), expected_len)) { - attrs |= 1 << type; - a[type] = nla; - } - } - if (rem) { - OVS_NLERR("Message has %d unknown bytes.\n", rem); - return -EINVAL; - } - - *attrsp = attrs; - return 0; -} - -static int parse_flow_mask_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) -{ - return __parse_flow_nlattrs(attr, a, attrsp, true); -} - -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) -{ - return __parse_flow_nlattrs(attr, a, attrsp, false); -} - -int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask) -{ - struct nlattr *a; - int rem; - bool ttl = false; - __be16 tun_flags = 0; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { - [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), - [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_TOS] = 1, - [OVS_TUNNEL_KEY_ATTR_TTL] = 1, - [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, - [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, - }; - - if (type > OVS_TUNNEL_KEY_ATTR_MAX) { - OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", - type, OVS_TUNNEL_KEY_ATTR_MAX); - return -EINVAL; - } - - if (ovs_tunnel_key_lens[type] != nla_len(a)) { - OVS_NLERR("IPv4 tunnel attribute type has unexpected " - " length (type=%d, length=%d, expected=%d).\n", - type, nla_len(a), ovs_tunnel_key_lens[type]); - return -EINVAL; - } - - switch (type) { - case OVS_TUNNEL_KEY_ATTR_ID: - SW_FLOW_KEY_PUT(match, tun_key.tun_id, - nla_get_be64(a), is_mask); - tun_flags |= TUNNEL_KEY; - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, - nla_get_be32(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, - nla_get_be32(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, - nla_get_u8(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, - nla_get_u8(a), is_mask); - ttl = true; - break; - case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_flags |= TUNNEL_DONT_FRAGMENT; - break; - case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_flags |= TUNNEL_CSUM; - break; - default: - return -EINVAL; - } - } - - SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); - - if (rem > 0) { - OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); - return -EINVAL; - } - - if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { - OVS_NLERR("IPv4 tunnel destination address is zero.\n"); - return -EINVAL; - } - - if (!ttl) { - OVS_NLERR("IPv4 tunnel TTL not specified.\n"); - return -EINVAL; - } - } - - return 0; -} - -int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) -{ - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - - if (output->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) - return -EMSGSIZE; - if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) - return -EMSGSIZE; - if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) - return -EMSGSIZE; - if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) - return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) - return -EMSGSIZE; - - nla_nest_end(skb, nla); - return 0; -} - -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask) -{ - if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { - SW_FLOW_KEY_PUT(match, phy.priority, - nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); - *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); - } - - if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - - if (is_mask) - in_port = 0xffffffff; /* Always exact match in_port. */ - else if (in_port >= DP_MAX_PORTS) - return -EINVAL; - - SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); - *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); - } - - if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { - uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - - SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); - *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); - } - if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask)) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); - } - return 0; -} - -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask) -{ - int err; - u64 orig_attrs = attrs; - - err = metadata_from_nlattrs(match, &attrs, a, is_mask); - if (err) - return err; - - if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { - const struct ovs_key_ethernet *eth_key; - - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - SW_FLOW_KEY_MEMCPY(match, eth.src, - eth_key->eth_src, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, eth.dst, - eth_key->eth_dst, ETH_ALEN, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); - } - - if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - __be16 tci; - - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - if (is_mask) - OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); - else - OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); - - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_VLAN); - } else if (!is_mask) - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); - - if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { - __be16 eth_type; - - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (is_mask) { - /* Always exact match EtherType. */ - eth_type = htons(0xffff); - } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { - OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", - ntohs(eth_type), ETH_P_802_3_MIN); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); - } - - if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { - const struct ovs_key_ipv4 *ipv4_key; - - ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", - ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); - return -EINVAL; - } - SW_FLOW_KEY_PUT(match, ip.proto, - ipv4_key->ipv4_proto, is_mask); - SW_FLOW_KEY_PUT(match, ip.tos, - ipv4_key->ipv4_tos, is_mask); - SW_FLOW_KEY_PUT(match, ip.ttl, - ipv4_key->ipv4_ttl, is_mask); - SW_FLOW_KEY_PUT(match, ip.frag, - ipv4_key->ipv4_frag, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.src, - ipv4_key->ipv4_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.dst, - ipv4_key->ipv4_dst, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - } - - if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; - - ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", - ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); - return -EINVAL; - } - SW_FLOW_KEY_PUT(match, ipv6.label, - ipv6_key->ipv6_label, is_mask); - SW_FLOW_KEY_PUT(match, ip.proto, - ipv6_key->ipv6_proto, is_mask); - SW_FLOW_KEY_PUT(match, ip.tos, - ipv6_key->ipv6_tclass, is_mask); - SW_FLOW_KEY_PUT(match, ip.ttl, - ipv6_key->ipv6_hlimit, is_mask); - SW_FLOW_KEY_PUT(match, ip.frag, - ipv6_key->ipv6_frag, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, - ipv6_key->ipv6_src, - sizeof(match->key->ipv6.addr.src), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, - ipv6_key->ipv6_dst, - sizeof(match->key->ipv6.addr.dst), - is_mask); - - attrs &= ~(1 << OVS_KEY_ATTR_IPV6); - } - - if (attrs & (1 << OVS_KEY_ATTR_ARP)) { - const struct ovs_key_arp *arp_key; - - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - if (!is_mask && (arp_key->arp_op & htons(0xff00))) { - OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", - arp_key->arp_op); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, ipv4.addr.src, - arp_key->arp_sip, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.dst, - arp_key->arp_tip, is_mask); - SW_FLOW_KEY_PUT(match, ip.proto, - ntohs(arp_key->arp_op), is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, - arp_key->arp_sha, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, - arp_key->arp_tha, ETH_ALEN, is_mask); - - attrs &= ~(1 << OVS_KEY_ATTR_ARP); - } - - if (attrs & (1 << OVS_KEY_ATTR_TCP)) { - const struct ovs_key_tcp *tcp_key; - - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - tcp_key->tcp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - tcp_key->tcp_dst, is_mask); - } - attrs &= ~(1 << OVS_KEY_ATTR_TCP); - } - - if (attrs & (1 << OVS_KEY_ATTR_UDP)) { - const struct ovs_key_udp *udp_key; - - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - udp_key->udp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - udp_key->udp_dst, is_mask); - } - attrs &= ~(1 << OVS_KEY_ATTR_UDP); - } - - if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { - const struct ovs_key_sctp *sctp_key; - - sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - sctp_key->sctp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - sctp_key->sctp_dst, is_mask); - } - attrs &= ~(1 << OVS_KEY_ATTR_SCTP); - } - - if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { - const struct ovs_key_icmp *icmp_key; - - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - htons(icmp_key->icmp_type), is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - htons(icmp_key->icmp_code), is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ICMP); - } - - if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { - const struct ovs_key_icmpv6 *icmpv6_key; - - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - htons(icmpv6_key->icmpv6_type), is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - htons(icmpv6_key->icmpv6_code), is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); - } - - if (attrs & (1 << OVS_KEY_ATTR_ND)) { - const struct ovs_key_nd *nd_key; - - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, - nd_key->nd_target, - sizeof(match->key->ipv6.nd.target), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, - nd_key->nd_sll, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, - nd_key->nd_tll, ETH_ALEN, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ND); - } - - if (attrs != 0) - return -EINVAL; - - return 0; -} - -/** - * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and - * mask. In case the 'mask' is NULL, the flow is treated as exact match - * flow. Otherwise, it is treated as a wildcarded flow, except the mask - * does not include any don't care bit. - * @match: receives the extracted flow match information. - * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. The fields should of the packet that triggered the creation - * of this flow. - * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink - * attribute specifies the mask field of the wildcarded flow. - */ -int ovs_match_from_nlattrs(struct sw_flow_match *match, - const struct nlattr *key, - const struct nlattr *mask) -{ - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct nlattr *encap; - u64 key_attrs = 0; - u64 mask_attrs = 0; - bool encap_valid = false; - int err; - - err = parse_flow_nlattrs(key, a, &key_attrs); - if (err) - return err; - - if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { - __be16 tci; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR("Invalid Vlan frame.\n"); - return -EINVAL; - } - - key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - encap = a[OVS_KEY_ATTR_ENCAP]; - key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - encap_valid = true; - - if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) { - OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); - return -EINVAL; - } - } else { - OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); - return -EINVAL; - } - } - - err = ovs_key_from_nlattrs(match, key_attrs, a, false); - if (err) - return err; - - if (mask) { - err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); - if (err) - return err; - - if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) { - __be16 eth_type = 0; - __be16 tci = 0; - - if (!encap_valid) { - OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); - return -EINVAL; - } - - mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - if (a[OVS_KEY_ATTR_ETHERTYPE]) - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (eth_type == htons(0xffff)) { - mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); - } else { - OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", - ntohs(eth_type)); - return -EINVAL; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); - return -EINVAL; - } - } - - err = ovs_key_from_nlattrs(match, mask_attrs, a, true); - if (err) - return err; - } else { - /* Populate exact match flow's key mask. */ - if (match->mask) - ovs_sw_flow_mask_set(match->mask, &match->range, 0xff); - } - - if (!ovs_match_validate(match, key_attrs, mask_attrs)) - return -EINVAL; - - return 0; -} - -/** - * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. - * @flow: Receives extracted in_port, priority, tun_key and skb_mark. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. - * - * This parses a series of Netlink attributes that form a flow key, which must - * take the same form accepted by flow_from_nlattrs(), but only enough of it to - * get the metadata, that is, the parts of the flow key that cannot be - * extracted from the packet itself. - */ - -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, - const struct nlattr *attr) -{ - struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - u64 attrs = 0; - int err; - struct sw_flow_match match; - - flow->key.phy.in_port = DP_MAX_PORTS; - flow->key.phy.priority = 0; - flow->key.phy.skb_mark = 0; - memset(tun_key, 0, sizeof(flow->key.tun_key)); - - err = parse_flow_nlattrs(attr, a, &attrs); - if (err) - return -EINVAL; - - memset(&match, 0, sizeof(match)); - match.key = &flow->key; - - err = metadata_from_nlattrs(&match, &attrs, a, false); - if (err) - return err; - - return 0; -} - -int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, struct sk_buff *skb) -{ - struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; - bool is_mask = (swkey != output); - - if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) - goto nla_put_failure; - - if ((swkey->tun_key.ipv4_dst || is_mask) && - ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) - goto nla_put_failure; - - if (swkey->phy.in_port == DP_MAX_PORTS) { - if (is_mask && (output->phy.in_port == 0xffff)) - if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) - goto nla_put_failure; - } else { - u16 upper_u16; - upper_u16 = !is_mask ? 0 : 0xffff; - - if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, - (upper_u16 << 16) | output->phy.in_port)) - goto nla_put_failure; - } - - if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) - goto nla_put_failure; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); - if (!nla) - goto nla_put_failure; - - eth_key = nla_data(nla); - memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); - - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - __be16 eth_type; - eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) - goto nla_put_failure; - encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) - goto unencap; - } else - encap = NULL; - - if (swkey->eth.type == htons(ETH_P_802_2)) { - /* - * Ethertype 802.2 is represented in the netlink with omitted - * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and - * 0xffff in the mask attribute. Ethertype can also - * be wildcarded. - */ - if (is_mask && output->eth.type) - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, - output->eth.type)) - goto nla_put_failure; - goto unencap; - } - - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) - goto nla_put_failure; - - if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ipv4 *ipv4_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); - if (!nla) - goto nla_put_failure; - ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = output->ipv4.addr.src; - ipv4_key->ipv4_dst = output->ipv4.addr.dst; - ipv4_key->ipv4_proto = output->ip.proto; - ipv4_key->ipv4_tos = output->ip.tos; - ipv4_key->ipv4_ttl = output->ip.ttl; - ipv4_key->ipv4_frag = output->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ipv6 *ipv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); - if (!nla) - goto nla_put_failure; - ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, - sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, - sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = output->ipv6.label; - ipv6_key->ipv6_proto = output->ip.proto; - ipv6_key->ipv6_tclass = output->ip.tos; - ipv6_key->ipv6_hlimit = output->ip.ttl; - ipv6_key->ipv6_frag = output->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { - struct ovs_key_arp *arp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); - if (!nla) - goto nla_put_failure; - arp_key = nla_data(nla); - memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = output->ipv4.addr.src; - arp_key->arp_tip = output->ipv4.addr.dst; - arp_key->arp_op = htons(output->ip.proto); - memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); - } - - if ((swkey->eth.type == htons(ETH_P_IP) || - swkey->eth.type == htons(ETH_P_IPV6)) && - swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - - if (swkey->ip.proto == IPPROTO_TCP) { - struct ovs_key_tcp *tcp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); - if (!nla) - goto nla_put_failure; - tcp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = output->ipv4.tp.src; - tcp_key->tcp_dst = output->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = output->ipv6.tp.src; - tcp_key->tcp_dst = output->ipv6.tp.dst; - } - } else if (swkey->ip.proto == IPPROTO_UDP) { - struct ovs_key_udp *udp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); - if (!nla) - goto nla_put_failure; - udp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = output->ipv4.tp.src; - udp_key->udp_dst = output->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = output->ipv6.tp.src; - udp_key->udp_dst = output->ipv6.tp.dst; - } - } else if (swkey->ip.proto == IPPROTO_SCTP) { - struct ovs_key_sctp *sctp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); - if (!nla) - goto nla_put_failure; - sctp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - sctp_key->sctp_src = swkey->ipv4.tp.src; - sctp_key->sctp_dst = swkey->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - sctp_key->sctp_src = swkey->ipv6.tp.src; - sctp_key->sctp_dst = swkey->ipv6.tp.dst; - } - } else if (swkey->eth.type == htons(ETH_P_IP) && - swkey->ip.proto == IPPROTO_ICMP) { - struct ovs_key_icmp *icmp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); - if (!nla) - goto nla_put_failure; - icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(output->ipv4.tp.src); - icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); - } else if (swkey->eth.type == htons(ETH_P_IPV6) && - swkey->ip.proto == IPPROTO_ICMPV6) { - struct ovs_key_icmpv6 *icmpv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, - sizeof(*icmpv6_key)); - if (!nla) - goto nla_put_failure; - icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); - - if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { - struct ovs_key_nd *nd_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); - if (!nla) - goto nla_put_failure; - nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &output->ipv6.nd.target, - sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); - } - } - } - -unencap: - if (encap) - nla_nest_end(skb, encap); - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -/* Initializes the flow module. - * Returns zero if successful or a negative error code. */ -int ovs_flow_init(void) -{ - BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); - BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); - - flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, - 0, NULL); - if (flow_cache == NULL) - return -ENOMEM; - - return 0; -} - -/* Uninitializes the flow module. */ -void ovs_flow_exit(void) -{ - kmem_cache_destroy(flow_cache); -} - -struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) -{ - struct sw_flow_mask *mask; - - mask = kmalloc(sizeof(*mask), GFP_KERNEL); - if (mask) - mask->ref_count = 0; - - return mask; -} - -void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) -{ - mask->ref_count++; -} - -void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) -{ - if (!mask) - return; - - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - if (deferred) - kfree_rcu(mask, rcu); - else - kfree(mask); - } -} - -static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a, - const struct sw_flow_mask *b) -{ - u8 *a_ = (u8 *)&a->key + a->range.start; - u8 *b_ = (u8 *)&b->key + b->range.start; - - return (a->range.end == b->range.end) - && (a->range.start == b->range.start) - && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); -} - -struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, - const struct sw_flow_mask *mask) -{ - struct list_head *ml; - - list_for_each(ml, tbl->mask_list) { - struct sw_flow_mask *m; - m = container_of(ml, struct sw_flow_mask, list); - if (ovs_sw_flow_mask_equal(mask, m)) - return m; - } - - return NULL; -} - -/** - * add a new mask into the mask list. - * The caller needs to make sure that 'mask' is not the same - * as any masks that are already on the list. - */ -void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - list_add_rcu(&mask->list, tbl->mask_list); -} - -/** - * Set 'range' fields in the mask to the value of 'val'. - */ -static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, - struct sw_flow_key_range *range, u8 val) -{ - u8 *m = (u8 *)&mask->key + range->start; - - mask->range = *range; - memset(m, val, range_n_bytes(range)); -} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 212fbf7510c..5e5aaed3a85 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -19,6 +19,7 @@ #ifndef FLOW_H #define FLOW_H 1 +#include <linux/cache.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/openvswitch.h> @@ -33,14 +34,6 @@ #include <net/inet_ecn.h> struct sk_buff; -struct sw_flow_mask; -struct flow_table; - -struct sw_flow_actions { - struct rcu_head rcu; - u32 actions_len; - struct nlattr actions[]; -}; /* Used to memset ovs_key_ipv4_tunnel padding. */ #define OVS_TUNNEL_KEY_SIZE \ @@ -54,7 +47,7 @@ struct ovs_key_ipv4_tunnel { __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; -}; +} __packed __aligned(4); /* Minimize padding. */ static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, const struct iphdr *iph, __be64 tun_id, @@ -78,7 +71,7 @@ struct sw_flow_key { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ - } phy; + } __packed phy; /* Safe when right after 'tun_key'. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ @@ -91,22 +84,21 @@ struct sw_flow_key { u8 ttl; /* IP TTL/hop limit. */ u8 frag; /* One of OVS_FRAG_TYPE_*. */ } ip; + struct { + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ + } tp; union { struct { struct { __be32 src; /* IP source address. */ __be32 dst; /* IP destination address. */ } addr; - union { - struct { - __be16 src; /* TCP/UDP/SCTP source port. */ - __be16 dst; /* TCP/UDP/SCTP destination port. */ - } tp; - struct { - u8 sha[ETH_ALEN]; /* ARP source hardware address. */ - u8 tha[ETH_ALEN]; /* ARP target hardware address. */ - } arp; - }; + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; } ipv4; struct { struct { @@ -115,10 +107,6 @@ struct sw_flow_key { } addr; __be32 label; /* IPv6 flow label. */ struct { - __be16 src; /* TCP/UDP/SCTP source port. */ - __be16 dst; /* TCP/UDP/SCTP destination port. */ - } tp; - struct { struct in6_addr target; /* ND target address. */ u8 sll[ETH_ALEN]; /* ND source link layer address. */ u8 tll[ETH_ALEN]; /* ND target link layer address. */ @@ -127,26 +115,17 @@ struct sw_flow_key { }; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ -struct sw_flow { - struct rcu_head rcu; - struct hlist_node hash_node[2]; - u32 hash; - - struct sw_flow_key key; - struct sw_flow_key unmasked_key; - struct sw_flow_mask *mask; - struct sw_flow_actions __rcu *sf_acts; - - spinlock_t lock; /* Lock for values below. */ - unsigned long used; /* Last used time (in jiffies). */ - u64 packet_count; /* Number of packets matched. */ - u64 byte_count; /* Number of bytes matched. */ - u8 tcp_flags; /* Union of seen TCP flags. */ +struct sw_flow_key_range { + unsigned short int start; + unsigned short int end; }; -struct sw_flow_key_range { - size_t start; - size_t end; +struct sw_flow_mask { + int ref_count; + struct rcu_head rcu; + struct list_head list; + struct sw_flow_key_range range; + struct sw_flow_key key; }; struct sw_flow_match { @@ -155,8 +134,37 @@ struct sw_flow_match { struct sw_flow_mask *mask; }; -void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, struct sw_flow_mask *mask); +struct sw_flow_actions { + struct rcu_head rcu; + u32 actions_len; + struct nlattr actions[]; +}; + +struct flow_stats { + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + unsigned long used; /* Last used time (in jiffies). */ + spinlock_t lock; /* Lock for atomic stats update. */ + __be16 tcp_flags; /* Union of seen TCP flags. */ +}; + +struct sw_flow { + struct rcu_head rcu; + struct hlist_node hash_node[2]; + u32 hash; + int stats_last_writer; /* NUMA-node id of the last writer on + * 'stats[0]'. + */ + struct sw_flow_key key; + struct sw_flow_key unmasked_key; + struct sw_flow_mask *mask; + struct sw_flow_actions __rcu *sf_acts; + struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + * is allocated at flow creation time, + * the rest are allocated on demand + * while holding the 'stats[0].lock'. + */ +}; struct arp_eth_header { __be16 ar_hrd; /* format of hardware address */ @@ -172,88 +180,13 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; -int ovs_flow_init(void); -void ovs_flow_exit(void); - -struct sw_flow *ovs_flow_alloc(void); -void ovs_flow_deferred_free(struct sw_flow *); -void ovs_flow_free(struct sw_flow *, bool deferred); - -struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len); -void ovs_flow_deferred_free_acts(struct sw_flow_actions *); - -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); -void ovs_flow_used(struct sw_flow *, struct sk_buff *); +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + struct sk_buff *); +void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, + unsigned long *used, __be16 *tcp_flags); +void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_to_nlattrs(const struct sw_flow_key *, - const struct sw_flow_key *, struct sk_buff *); -int ovs_match_from_nlattrs(struct sw_flow_match *match, - const struct nlattr *, - const struct nlattr *); -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, - const struct nlattr *attr); - -#define MAX_ACTIONS_BUFSIZE (32 * 1024) -#define TBL_MIN_BUCKETS 1024 - -struct flow_table { - struct flex_array *buckets; - unsigned int count, n_buckets; - struct rcu_head rcu; - struct list_head *mask_list; - int node_ver; - u32 hash_seed; - bool keep_flows; -}; - -static inline int ovs_flow_tbl_count(struct flow_table *table) -{ - return table->count; -} - -static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) -{ - return (table->count > table->n_buckets); -} - -struct sw_flow *ovs_flow_lookup(struct flow_table *, - const struct sw_flow_key *); -struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, - struct sw_flow_match *match); - -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); -struct flow_table *ovs_flow_tbl_alloc(int new_size); -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); - -void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow); -void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow); - -struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx); -extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; -int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask); -int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output); -bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_end); - -struct sw_flow_mask { - int ref_count; - struct rcu_head rcu; - struct list_head list; - struct sw_flow_key_range range; - struct sw_flow_key key; -}; +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); -struct sw_flow_mask *ovs_sw_flow_mask_alloc(void); -void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *); -void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred); -void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *); -struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *, - const struct sw_flow_mask *); -void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c new file mode 100644 index 00000000000..d757848da89 --- /dev/null +++ b/net/openvswitch/flow_netlink.c @@ -0,0 +1,1576 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +#include "flow_netlink.h" + +static void update_range__(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range = NULL; + size_t start = rounddown(offset, sizeof(long)); + size_t end = roundup(offset + size, sizeof(long)); + + if (!is_mask) + range = &match->range; + else if (match->mask) + range = &match->mask->range; + + if (!range) + return; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + (match)->mask->key.field = value; \ + } else { \ + (match)->key->field = value; \ + } \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + len, is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + memcpy(&(match)->mask->key.field, value_p, len);\ + } else { \ + memcpy(&(match)->key->field, value_p, len); \ + } \ + } while (0) + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +static bool match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_TCP_FLAGS) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. */ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.tp.src == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", + (unsigned long long)key_attrs, (unsigned long long)key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", + (unsigned long long)mask_attrs, (unsigned long long)mask_allowed); + return false; + } + + return true; +} + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ +static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = -1, + [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), + [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), + [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), + [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), + [OVS_KEY_ATTR_VLAN] = sizeof(__be16), + [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), + [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), + [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), + [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), + [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16), + [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), + [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), + [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), + [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), + [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_TUNNEL] = -1, +}; + +static bool is_all_zero(const u8 *fp, size_t size) +{ + int i; + + if (!fp) + return false; + + for (i = 0; i < size; i++) + if (fp[i]) + return false; + + return true; +} + +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool nz) +{ + const struct nlattr *nla; + u64 attrs; + int rem; + + attrs = *attrsp; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + type, OVS_KEY_ATTR_MAX); + return -EINVAL; + } + + if (attrs & (1 << type)) { + OVS_NLERR("Duplicate key attribute (type %d).\n", type); + return -EINVAL; + } + + expected_len = ovs_key_lens[type]; + if (nla_len(nla) != expected_len && expected_len != -1) { + OVS_NLERR("Key attribute has unexpected length (type=%d" + ", length=%d, expected=%d).\n", type, + nla_len(nla), expected_len); + return -EINVAL; + } + + if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + attrs |= 1 << type; + a[type] = nla; + } + } + if (rem) { + OVS_NLERR("Message has %d unknown bytes.\n", rem); + return -EINVAL; + } + + *attrsp = attrs; + return 0; +} + +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, false); +} + +static int ipv4_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask) +{ + struct nlattr *a; + int rem; + bool ttl = false; + __be16 tun_flags = 0; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_TOS] = 1, + [OVS_TUNNEL_KEY_ATTR_TTL] = 1, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, + [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + }; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", + type, OVS_TUNNEL_KEY_ATTR_MAX); + return -EINVAL; + } + + if (ovs_tunnel_key_lens[type] != nla_len(a)) { + OVS_NLERR("IPv4 tunnel attribute type has unexpected " + " length (type=%d, length=%d, expected=%d).\n", + type, nla_len(a), ovs_tunnel_key_lens[type]); + return -EINVAL; + } + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + nla_get_be32(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + nla_get_be32(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + nla_get_u8(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + nla_get_u8(a), is_mask); + ttl = true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + tun_flags |= TUNNEL_DONT_FRAGMENT; + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + tun_flags |= TUNNEL_CSUM; + break; + default: + return -EINVAL; + } + } + + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + + if (rem > 0) { + OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); + return -EINVAL; + } + + if (!is_mask) { + if (!match->key->tun_key.ipv4_dst) { + OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + return -EINVAL; + } + } + + return 0; +} + +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output) +{ + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) + return -EMSGSIZE; + if (output->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + return -EMSGSIZE; + if (output->ipv4_dst && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + return -EMSGSIZE; + if (output->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + + +static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, + const struct nlattr **a, bool is_mask) +{ + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } + + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + + if (is_mask) + in_port = 0xffffffff; /* Always exact match in_port. */ + else if (in_port >= DP_MAX_PORTS) + return -EINVAL; + + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); + } + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask)) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + return 0; +} + +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask) +{ + int err; + u64 orig_attrs = attrs; + + err = metadata_from_nlattrs(match, &attrs, a, is_mask); + if (err) + return err; + + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } + + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { + __be16 tci; + + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + else + OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); + + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } else if (!is_mask) + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", + ntohs(eth_type), ETH_P_802_3_MIN); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { + const struct ovs_key_ipv4 *ipv4_key; + + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { + const struct ovs_key_arp *arp_key; + + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + arp_key->arp_op); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + } else { + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) + return -EINVAL; + + return 0; +} + +static void sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val) +{ + u8 *m = (u8 *)&mask->key + range->start; + + mask->range = *range; + memset(m, val, range_n_bytes(range)); +} + +/** + * ovs_nla_get_match - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. + */ +int ovs_nla_get_match(struct sw_flow_match *match, + const struct nlattr *key, + const struct nlattr *mask) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(key, a, &key_attrs); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR("Invalid Vlan frame.\n"); + return -EINVAL; + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) { + OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + return -EINVAL; + } + } else { + OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, key_attrs, a, false); + if (err) + return err; + + if (mask) { + err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + if (err) + return err; + + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + return -EINVAL; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + } else { + OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", + ntohs(eth_type)); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + if (err) + return err; + } else { + /* Populate exact match flow's key mask. */ + if (match->mask) + sw_flow_mask_set(match->mask, &match->range, 0xff); + } + + if (!match_validate(match, key_attrs, mask_attrs)) + return -EINVAL; + + return 0; +} + +/** + * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. + * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + */ + +int ovs_nla_get_flow_metadata(struct sw_flow *flow, + const struct nlattr *attr) +{ + struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; + int err; + struct sw_flow_match match; + + flow->key.phy.in_port = DP_MAX_PORTS; + flow->key.phy.priority = 0; + flow->key.phy.skb_mark = 0; + memset(tun_key, 0, sizeof(flow->key.tun_key)); + + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) + return -EINVAL; + + memset(&match, 0, sizeof(match)); + match.key = &flow->key; + + err = metadata_from_nlattrs(&match, &attrs, a, false); + if (err) + return err; + + return 0; +} + +int ovs_nla_put_flow(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla, *encap; + bool is_mask = (swkey != output); + + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) + goto nla_put_failure; + + if ((swkey->tun_key.ipv4_dst || is_mask) && + ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) + goto nla_put_failure; + + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 0 : 0xffff; + + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) + goto nla_put_failure; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + + eth_key = nla_data(nla); + ether_addr_copy(eth_key->eth_src, output->eth.src); + ether_addr_copy(eth_key->eth_dst, output->eth.dst); + + if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + goto nla_put_failure; + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.tci) + goto unencap; + } else + encap = NULL; + + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; + goto unencap; + } + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) + goto nla_put_failure; + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); + ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + + if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + tcp_key->tcp_src = output->tp.src; + tcp_key->tcp_dst = output->tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->tp.flags)) + goto nla_put_failure; + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + udp_key->udp_src = output->tp.src; + udp_key->udp_dst = output->tp.dst; + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + sctp_key->sctp_src = output->tp.src; + sctp_key->sctp_dst = output->tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(output->tp.src); + icmp_key->icmp_code = ntohs(output->tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(output->tp.src); + icmpv6_key->icmpv6_code = ntohs(output->tp.dst); + + if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &output->ipv6.nd.target, + sizeof(nd_key->nd_target)); + ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); + ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); + } + } + } + +unencap: + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +#define MAX_ACTIONS_BUFSIZE (32 * 1024) + +struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) +{ + struct sw_flow_actions *sfa; + + if (size > MAX_ACTIONS_BUFSIZE) + return ERR_PTR(-EINVAL); + + sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = 0; + return sfa; +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + kfree_rcu(sf_acts, rcu); +} + +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, + int attr_len) +{ + + struct sw_flow_actions *acts; + int new_acts_size; + int req_size = NLA_ALIGN(attr_len); + int next_offset = offsetof(struct sw_flow_actions, actions) + + (*sfa)->actions_len; + + if (req_size <= (ksize(*sfa) - next_offset)) + goto out; + + new_acts_size = ksize(*sfa) * 2; + + if (new_acts_size > MAX_ACTIONS_BUFSIZE) { + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + return ERR_PTR(-EMSGSIZE); + new_acts_size = MAX_ACTIONS_BUFSIZE; + } + + acts = ovs_nla_alloc_flow_actions(new_acts_size); + if (IS_ERR(acts)) + return (void *)acts; + + memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); + acts->actions_len = (*sfa)->actions_len; + kfree(*sfa); + *sfa = acts; + +out: + (*sfa)->actions_len += req_size; + return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +{ + struct nlattr *a; + + a = reserve_sfa_size(sfa, nla_attr_size(len)); + if (IS_ERR(a)) + return PTR_ERR(a); + + a->nla_type = attrtype; + a->nla_len = nla_attr_size(len); + + if (data) + memcpy(nla_data(a), data, len); + memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + + return 0; +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, + int attrtype) +{ + int used = (*sfa)->actions_len; + int err; + + err = add_action(sfa, attrtype, NULL, 0); + if (err) + return err; + + return used; +} + +static inline void add_nested_action_end(struct sw_flow_actions *sfa, + int st_offset) +{ + struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + + st_offset); + + a->nla_len = sfa->actions_len - st_offset; +} + +static int validate_and_copy_sample(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem, start, err, st_acts; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy sample action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); + if (start < 0) + return start; + err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32)); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); + if (st_acts < 0) + return st_acts; + + err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; +} + +static int validate_tp_port(const struct sw_flow_key *flow_key) +{ + if ((flow_key->eth.type == htons(ETH_P_IP) || + flow_key->eth.type == htons(ETH_P_IPV6)) && + (flow_key->tp.src || flow_key->tp.dst)) + return 0; + + return -EINVAL; +} + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa) +{ + struct sw_flow_match match; + struct sw_flow_key key; + int err, start; + + ovs_match_init(&match, &key, NULL); + err = ipv4_tun_from_nlattr(nla_data(attr), &match, false); + if (err) + return err; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); + if (start < 0) + return start; + + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, + sizeof(match.key->tun_key)); + add_nested_action_end(*sfa, start); + + return err; +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, + bool *set_tun) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + + /* There can be only one key in a action */ + if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + if (key_type > OVS_KEY_ATTR_MAX || + (ovs_key_lens[key_type] != nla_len(ovs_key) && + ovs_key_lens[key_type] != -1)) + return -EINVAL; + + switch (key_type) { + const struct ovs_key_ipv4 *ipv4_key; + const struct ovs_key_ipv6 *ipv6_key; + int err; + + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_ETHERNET: + break; + + case OVS_KEY_ATTR_TUNNEL: + *set_tun = true; + err = validate_and_copy_set_tun(a, sfa); + if (err) + return err; + break; + + case OVS_KEY_ATTR_IPV4: + if (flow_key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_IPV6: + if (flow_key->eth.type != htons(ETH_P_IPV6)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_TCP: + if (flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + return validate_tp_port(flow_key); + + case OVS_KEY_ATTR_UDP: + if (flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + return validate_tp_port(flow_key); + + case OVS_KEY_ATTR_SCTP: + if (flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + return validate_tp_port(flow_key); + + default: + return -EINVAL; + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, + attr, userspace_policy); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static int copy_action(const struct nlattr *from, + struct sw_flow_actions **sfa) +{ + int totlen = NLA_ALIGN(from->nla_len); + struct nlattr *to; + + to = reserve_sfa_size(sfa, from->nla_len); + if (IS_ERR(to)) + return PTR_ERR(to); + + memcpy(to, from, totlen); + return 0; +} + +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, + struct sw_flow_actions **sfa) +{ + const struct nlattr *a; + int rem, err; + + if (depth >= SAMPLE_ACTION_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. */ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + bool skip_copy; + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + skip_copy = false; + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + + case OVS_ACTION_ATTR_POP_VLAN: + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + vlan = nla_data(a); + if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key, sfa, &skip_copy); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = validate_and_copy_sample(a, key, depth, sfa); + if (err) + return err; + skip_copy = true; + break; + + default: + return -EINVAL; + } + if (!skip_copy) { + err = copy_action(a, sfa); + if (err) + return err; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_sample; + + switch (type) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_SAMPLE_ATTR_ACTIONS: + st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!st_sample) + return -EMSGSIZE; + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_sample); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_IPV4_TUNNEL: + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), + nla_data(ovs_key)); + if (err) + return err; + nla_nest_end(skb, start); + break; + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + +int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) +{ + const struct nlattr *a; + int rem, err; + + nla_for_each_attr(a, attr, len, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample_action_to_attr(a, skb); + if (err) + return err; + break; + default: + if (nla_put(skb, type, nla_len(a), nla_data(a))) + return -EMSGSIZE; + break; + } + } + + return 0; +} diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h new file mode 100644 index 00000000000..440151045d3 --- /dev/null +++ b/net/openvswitch/flow_netlink.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + + +#ifndef FLOW_NETLINK_H +#define FLOW_NETLINK_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, struct sw_flow_mask *mask); + +int ovs_nla_put_flow(const struct sw_flow_key *, + const struct sw_flow_key *, struct sk_buff *); +int ovs_nla_get_flow_metadata(struct sw_flow *flow, + const struct nlattr *attr); +int ovs_nla_get_match(struct sw_flow_match *match, + const struct nlattr *, + const struct nlattr *); + +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa); +int ovs_nla_put_actions(const struct nlattr *attr, + int len, struct sk_buff *skb); + +struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len); +void ovs_nla_free_flow_actions(struct sw_flow_actions *); + +#endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c new file mode 100644 index 00000000000..cf2d853646f --- /dev/null +++ b/net/openvswitch/flow_table.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +#define TBL_MIN_BUCKETS 1024 +#define REHASH_INTERVAL (10 * 60 * HZ) + +static struct kmem_cache *flow_cache; +struct kmem_cache *flow_stats_cache __read_mostly; + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask) +{ + const long *m = (const long *)((const u8 *)&mask->key + + mask->range.start); + const long *s = (const long *)((const u8 *)src + + mask->range.start); + long *d = (long *)((u8 *)dst + mask->range.start); + int i; + + /* The memory outside of the 'mask->range' are not set since + * further operations on 'dst' only uses contents within + * 'mask->range'. + */ + for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + *d++ = *s++ & *m++; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + struct flow_stats *stats; + int node; + + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + flow->sf_acts = NULL; + flow->mask = NULL; + flow->stats_last_writer = NUMA_NO_NODE; + + /* Initialize the default stat node. */ + stats = kmem_cache_alloc_node(flow_stats_cache, + GFP_KERNEL | __GFP_ZERO, 0); + if (!stats) + goto err; + + spin_lock_init(&stats->lock); + + RCU_INIT_POINTER(flow->stats[0], stats); + + for_each_node(node) + if (node != 0) + RCU_INIT_POINTER(flow->stats[node], NULL); + + return flow; +err: + kmem_cache_free(flow_cache, flow); + return ERR_PTR(-ENOMEM); +} + +int ovs_flow_tbl_count(struct flow_table *table) +{ + return table->count; +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void flow_free(struct sw_flow *flow) +{ + int node; + + kfree((struct sw_flow_actions __force *)flow->sf_acts); + for_each_node(node) + if (flow->stats[node]) + kmem_cache_free(flow_stats_cache, + (struct flow_stats __force *)flow->stats[node]); + kmem_cache_free(flow_cache, flow); +} + +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + flow_free(flow); +} + +void ovs_flow_free(struct sw_flow *flow, bool deferred) +{ + if (!flow) + return; + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + flow_free(flow); +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + + +static void __table_instance_destroy(struct table_instance *ti) +{ + free_buckets(ti->buckets); + kfree(ti); +} + +static struct table_instance *table_instance_alloc(int new_size) +{ + struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (!ti) + return NULL; + + ti->buckets = alloc_buckets(new_size); + + if (!ti->buckets) { + kfree(ti); + return NULL; + } + ti->n_buckets = new_size; + ti->node_ver = 0; + ti->keep_flows = false; + get_random_bytes(&ti->hash_seed, sizeof(u32)); + + return ti; +} + +int ovs_flow_tbl_init(struct flow_table *table) +{ + struct table_instance *ti; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); + + if (!ti) + return -ENOMEM; + + rcu_assign_pointer(table->ti, ti); + INIT_LIST_HEAD(&table->mask_list); + table->last_rehash = jiffies; + table->count = 0; + return 0; +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct table_instance *ti = container_of(rcu, struct table_instance, rcu); + + __table_instance_destroy(ti); +} + +static void table_instance_destroy(struct table_instance *ti, bool deferred) +{ + int i; + + if (!ti) + return; + + if (ti->keep_flows) + goto skip_flows; + + for (i = 0; i < ti->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(ti->buckets, i); + struct hlist_node *n; + int ver = ti->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_flow_free(flow, deferred); + } + } + +skip_flows: + if (deferred) + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + else + __table_instance_destroy(ti); +} + +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +{ + struct table_instance *ti = ovsl_dereference(table->ti); + + table_instance_destroy(ti, deferred); +} + +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, + u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = ti->node_ver; + while (*bucket < ti->n_buckets) { + i = 0; + head = flex_array_get(ti->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) +{ + hash = jhash_1word(hash, ti->hash_seed); + return flex_array_get(ti->buckets, + (hash & (ti->n_buckets - 1))); +} + +static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->hash); + hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head); +} + +static void flow_table_copy_flows(struct table_instance *old, + struct table_instance *new) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + hlist_for_each_entry(flow, head, hash_node[old_ver]) + table_instance_insert(new, flow); + } + + old->keep_flows = true; +} + +static struct table_instance *table_instance_rehash(struct table_instance *ti, + int n_buckets) +{ + struct table_instance *new_ti; + + new_ti = table_instance_alloc(n_buckets); + if (!new_ti) + return NULL; + + flow_table_copy_flows(ti, new_ti); + + return new_ti; +} + +int ovs_flow_tbl_flush(struct flow_table *flow_table) +{ + struct table_instance *old_ti; + struct table_instance *new_ti; + + old_ti = ovsl_dereference(flow_table->ti); + new_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ti) + return -ENOMEM; + + rcu_assign_pointer(flow_table->ti, new_ti); + flow_table->last_rehash = jiffies; + flow_table->count = 0; + + table_instance_destroy(old_ti, true); + return 0; +} + +static u32 flow_hash(const struct sw_flow_key *key, int key_start, + int key_end) +{ + const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); + int hash_u32s = (key_end - key_start) >> 2; + + /* Make sure number of hash bytes are multiple of u32. */ + BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + + return arch_fast_hash2(hash_key, hash_u32s, 0); +} + +static int flow_key_start(const struct sw_flow_key *key) +{ + if (key->tun_key.ipv4_dst) + return 0; + else + return rounddown(offsetof(struct sw_flow_key, phy), + sizeof(long)); +} + +static bool cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, + int key_start, int key_end) +{ + const long *cp1 = (const long *)((const u8 *)key1 + key_start); + const long *cp2 = (const long *)((const u8 *)key2 + key_start); + long diffs = 0; + int i; + + for (i = key_start; i < key_end; i += sizeof(long)) + diffs |= *cp1++ ^ *cp2++; + + return diffs == 0; +} + +static bool flow_cmp_masked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, + int key_start, int key_end) +{ + return cmp_key(&flow->key, key, key_start, key_end); +} + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + struct sw_flow_match *match) +{ + struct sw_flow_key *key = match->key; + int key_start = flow_key_start(key); + int key_end = match->range.end; + + return cmp_key(&flow->unmasked_key, key, key_start, key_end); +} + +static struct sw_flow *masked_flow_lookup(struct table_instance *ti, + const struct sw_flow_key *unmasked, + struct sw_flow_mask *mask) +{ + struct sw_flow *flow; + struct hlist_head *head; + int key_start = mask->range.start; + int key_end = mask->range.end; + u32 hash; + struct sw_flow_key masked_key; + + ovs_flow_mask_key(&masked_key, unmasked, mask); + hash = flow_hash(&masked_key, key_start, key_end); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) { + if (flow->mask == mask && flow->hash == hash && + flow_cmp_masked_key(flow, &masked_key, + key_start, key_end)) + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 *n_mask_hit) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + *n_mask_hit = 0; + list_for_each_entry_rcu(mask, &tbl->mask_list, list) { + (*n_mask_hit)++; + flow = masked_flow_lookup(ti, key, mask); + if (flow) /* Found */ + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + u32 __always_unused n_mask_hit; + + return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); +} + +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + /* Always called under ovs-mutex. */ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */ + return flow; + } + return NULL; +} + +int ovs_flow_tbl_num_masks(const struct flow_table *table) +{ + struct sw_flow_mask *mask; + int num = 0; + + list_for_each_entry(mask, &table->mask_list, list) + num++; + + return num; +} + +static struct table_instance *table_instance_expand(struct table_instance *ti) +{ + return table_instance_rehash(ti, ti->n_buckets * 2); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + kfree_rcu(mask, rcu); + } + } +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti = ovsl_dereference(table->ti); + + BUG_ON(table->count == 0); + hlist_del_rcu(&flow->hash_node[ti->node_ver]); + table->count--; + + /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be + * accessible as long as the RCU read lock is held. + */ + flow_mask_remove(table, flow->mask); +} + +static struct sw_flow_mask *mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 1; + + return mask; +} + +static bool mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + const u8 *a_ = (const u8 *)&a->key + a->range.start; + const u8 *b_ = (const u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, &tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/* Add 'mask' into the mask list, if it is not already there. */ +static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, + struct sw_flow_mask *new) +{ + struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); + if (!mask) { + /* Allocate a new mask if none exsits. */ + mask = mask_alloc(); + if (!mask) + return -ENOMEM; + mask->key = new->key; + mask->range = new->range; + list_add_rcu(&mask->list, &tbl->mask_list); + } else { + BUG_ON(!mask->ref_count); + mask->ref_count++; + } + + flow->mask = mask; + return 0; +} + +/* Must be called with OVS mutex held. */ +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + struct sw_flow_mask *mask) +{ + struct table_instance *new_ti = NULL; + struct table_instance *ti; + int err; + + err = flow_mask_insert(table, flow, mask); + if (err) + return err; + + flow->hash = flow_hash(&flow->key, flow->mask->range.start, + flow->mask->range.end); + ti = ovsl_dereference(table->ti); + table_instance_insert(ti, flow); + table->count++; + + /* Expand table, if necessary, to make room. */ + if (table->count > ti->n_buckets) + new_ti = table_instance_expand(ti); + else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) + new_ti = table_instance_rehash(ti, ti->n_buckets); + + if (new_ti) { + rcu_assign_pointer(table->ti, new_ti); + table_instance_destroy(ti, true); + table->last_rehash = jiffies; + } + return 0; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); + BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + + (num_possible_nodes() + * sizeof(struct flow_stats *)), + 0, 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + flow_stats_cache + = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (flow_stats_cache == NULL) { + kmem_cache_destroy(flow_cache); + flow_cache = NULL; + return -ENOMEM; + } + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_stats_cache); + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h new file mode 100644 index 00000000000..5918bff7f3f --- /dev/null +++ b/net/openvswitch/flow_table.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +struct table_instance { + struct flex_array *buckets; + unsigned int n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + bool keep_flows; +}; + +struct flow_table { + struct table_instance __rcu *ti; + struct list_head mask_list; + unsigned long last_rehash; + unsigned int count; +}; + +extern struct kmem_cache *flow_stats_cache; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_free(struct sw_flow *, bool deferred); + +int ovs_flow_tbl_init(struct flow_table *); +int ovs_flow_tbl_count(struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +int ovs_flow_tbl_flush(struct flow_table *flow_table); + +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + struct sw_flow_mask *mask); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +int ovs_flow_tbl_num_masks(const struct flow_table *table); +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, + u32 *bucket, u32 *idx); +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, + const struct sw_flow_key *, + u32 *n_mask_hit); +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match); +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + struct sw_flow_match *match); + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask); +#endif /* flow_table.h */ diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index c99dea543d6..f49148a07da 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -24,8 +24,6 @@ #include <linux/if_tunnel.h> #include <linux/if_vlan.h> #include <linux/in.h> -#include <linux/if_vlan.h> -#include <linux/in.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <linux/jhash.h> @@ -112,6 +110,22 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_RCVD; } +/* Called with rcu_read_lock and BH disabled. */ +static int gre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) +{ + struct ovs_net *ovs_net; + struct vport *vport; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + + if (unlikely(!vport)) + return PACKET_REJECT; + else + return PACKET_RCVD; +} + static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); @@ -174,9 +188,9 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; - return iptunnel_xmit(rt, skb, fl.saddr, + return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, false); @@ -188,6 +202,7 @@ error: static struct gre_cisco_protocol gre_protocol = { .handler = gre_rcv, + .err_handler = gre_err, .priority = 1, }; @@ -258,7 +273,7 @@ static void gre_tnl_destroy(struct vport *vport) ovs_net = net_generic(net, ovs_net_id); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL); + RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); ovs_vport_deferred_free(vport); gre_exit(); } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 98d3edbbc23..789af9280e7 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -130,11 +130,11 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev->destructor = internal_dev_destructor; - SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); + netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO; + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; netdev->vlan_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 09d93c13cfd..d21f77d875b 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -150,15 +150,25 @@ static void free_port_rcu(struct rcu_head *rcu) ovs_vport_free(vport_from_priv(netdev_vport)); } -static void netdev_destroy(struct vport *vport) +void ovs_netdev_detach_dev(struct vport *vport) { struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); + ASSERT_RTNL(); netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(netdev_vport->dev); - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(netdev_vport->dev, + netdev_master_upper_dev_get(netdev_vport->dev)); dev_set_promiscuity(netdev_vport->dev, -1); +} + +static void netdev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + rtnl_lock(); + if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&netdev_vport->rcu, free_port_rcu); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index dd298b5c5cd..8df01c1127e 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -39,5 +39,6 @@ netdev_vport_priv(const struct vport *vport) } const char *ovs_netdev_get_name(const struct vport *); +void ovs_netdev_detach_dev(struct vport *); #endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index a481c03e286..0edbd95c60e 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -29,7 +29,6 @@ #include <net/ip.h> #include <net/udp.h> #include <net/ip_tunnels.h> -#include <net/udp.h> #include <net/rtnetlink.h> #include <net/route.h> #include <net/dsfield.h> @@ -123,7 +122,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) vxlan_port = vxlan_vport(vport); strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false); + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0); if (IS_ERR(vs)) { ovs_vport_free(vport); return (void *)vs; @@ -171,9 +170,9 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; - inet_get_local_port_range(&port_min, &port_max); + inet_get_local_port_range(net, &port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, @@ -181,7 +180,8 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, src_port, dst_port, - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); + htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8), + false); if (err < 0) ip_rt_put(rt); error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6f65dbe1381..42c0f4a0b78 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -33,6 +33,9 @@ #include "vport.h" #include "vport-internal_dev.h" +static void ovs_vport_record_error(struct vport *, + enum vport_err_type err_type); + /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the bottom of vport.h. */ static const struct vport_ops *vport_ops_list[] = { @@ -135,7 +138,7 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); - vport->percpu_stats = alloc_percpu(struct pcpu_tstats); + vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->percpu_stats) { kfree(vport); return ERR_PTR(-ENOMEM); @@ -267,16 +270,16 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) spin_unlock_bh(&vport->stats_lock); for_each_possible_cpu(i) { - const struct pcpu_tstats *percpu_stats; - struct pcpu_tstats local_stats; + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; unsigned int start; percpu_stats = per_cpu_ptr(vport->percpu_stats, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->syncp); + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); stats->rx_bytes += local_stats.rx_bytes; stats->rx_packets += local_stats.rx_packets; @@ -336,7 +339,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, struct ovs_key_ipv4_tunnel *tun_key) { - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); @@ -362,7 +365,7 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) int sent = vport->ops->send(vport, skb); if (likely(sent > 0)) { - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; stats = this_cpu_ptr(vport->percpu_stats); @@ -388,7 +391,8 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) * If using the vport generic stats layer indicate that an error of the given * type has occurred. */ -void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) +static void ovs_vport_record_error(struct vport *vport, + enum vport_err_type err_type) { spin_lock(&vport->stats_lock); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1a9fbcec6e1..8d721e62f38 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -87,7 +87,7 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_tstats __percpu *percpu_stats; + struct pcpu_sw_netstats __percpu *percpu_stats; spinlock_t stats_lock; struct vport_err_stats err_stats; @@ -172,7 +172,7 @@ void ovs_vport_deferred_free(struct vport *vport); */ static inline void *vport_priv(const struct vport *vport) { - return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); + return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); } /** @@ -185,14 +185,13 @@ static inline void *vport_priv(const struct vport *vport) * the result of a hash table lookup. @priv must point to the start of the * private data area. */ -static inline struct vport *vport_from_priv(const void *priv) +static inline struct vport *vport_from_priv(void *priv) { - return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); + return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } void ovs_vport_receive(struct vport *, struct sk_buff *, struct ovs_key_ipv4_tunnel *); -void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ |
