aboutsummaryrefslogtreecommitdiff
path: root/net/openvswitch
diff options
context:
space:
mode:
Diffstat (limited to 'net/openvswitch')
-rw-r--r--net/openvswitch/Kconfig28
-rw-r--r--net/openvswitch/Makefile12
-rw-r--r--net/openvswitch/actions.c183
-rw-r--r--net/openvswitch/datapath.c1919
-rw-r--r--net/openvswitch/datapath.h112
-rw-r--r--net/openvswitch/dp_notify.c84
-rw-r--r--net/openvswitch/flow.c1144
-rw-r--r--net/openvswitch/flow.h193
-rw-r--r--net/openvswitch/flow_netlink.c1576
-rw-r--r--net/openvswitch/flow_netlink.h60
-rw-r--r--net/openvswitch/flow_table.c647
-rw-r--r--net/openvswitch/flow_table.h86
-rw-r--r--net/openvswitch/vport-gre.c287
-rw-r--r--net/openvswitch/vport-internal_dev.c48
-rw-r--r--net/openvswitch/vport-internal_dev.h2
-rw-r--r--net/openvswitch/vport-netdev.c99
-rw-r--r--net/openvswitch/vport-netdev.h8
-rw-r--r--net/openvswitch/vport-vxlan.c204
-rw-r--r--net/openvswitch/vport.c144
-rw-r--r--net/openvswitch/vport.h67
20 files changed, 4803 insertions, 2100 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d9ea33c361b..6ecf491ad50 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -4,6 +4,7 @@
config OPENVSWITCH
tristate "Open vSwitch"
+ select LIBCRC32C
---help---
Open vSwitch is a multilayer Ethernet switch targeted at virtualized
environments. In addition to supporting a variety of features
@@ -26,3 +27,30 @@ config OPENVSWITCH
called openvswitch.
If unsure, say N.
+
+config OPENVSWITCH_GRE
+ bool "Open vSwitch GRE tunneling support"
+ depends on INET
+ depends on OPENVSWITCH
+ depends on NET_IPGRE_DEMUX && !(OPENVSWITCH=y && NET_IPGRE_DEMUX=m)
+ default y
+ ---help---
+ If you say Y here, then the Open vSwitch will be able create GRE
+ vport.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
+
+config OPENVSWITCH_VXLAN
+ bool "Open vSwitch VXLAN tunneling support"
+ depends on INET
+ depends on OPENVSWITCH
+ depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m)
+ default y
+ ---help---
+ If you say Y here, then the Open vSwitch will be able create vxlan vport.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 15e7384745c..3591cb5dae9 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -9,6 +9,16 @@ openvswitch-y := \
datapath.o \
dp_notify.o \
flow.o \
+ flow_netlink.o \
+ flow_table.o \
vport.o \
vport-internal_dev.o \
- vport-netdev.o \
+ vport-netdev.o
+
+ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
+openvswitch-y += vport-vxlan.o
+endif
+
+ifneq ($(CONFIG_OPENVSWITCH_GRE),)
+openvswitch-y += vport-gre.o
+endif
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 48badffaafc..e70d8b18e96 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira Networks.
+ * Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -22,14 +22,17 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
+#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/in6.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
+#include <net/ipv6.h>
#include <net/checksum.h>
#include <net/dsfield.h>
+#include <net/sctp/checksum.h>
#include "datapath.h"
#include "vport.h"
@@ -45,7 +48,7 @@ static int make_writable(struct sk_buff *skb, int write_len)
return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}
-/* remove VLAN header from packet and update csum accrodingly. */
+/* remove VLAN header from packet and update csum accordingly. */
static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
{
struct vlan_hdr *vhdr;
@@ -57,7 +60,7 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_sub(skb->csum, csum_partial(skb->data
- + ETH_HLEN, VLAN_HLEN, 0));
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
*current_tci = vhdr->h_vlan_TCI;
@@ -97,7 +100,7 @@ static int pop_vlan(struct sk_buff *skb)
if (unlikely(err))
return err;
- __vlan_hwaccel_put_tag(skb, ntohs(tci));
+ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci));
return 0;
}
@@ -109,15 +112,15 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
/* push down current VLAN tag */
current_tag = vlan_tx_tag_get(skb);
- if (!__vlan_put_tag(skb, current_tag))
+ if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
return -ENOMEM;
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_add(skb->csum, csum_partial(skb->data
- + ETH_HLEN, VLAN_HLEN, 0));
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
}
- __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+ __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
return 0;
}
@@ -129,8 +132,12 @@ static int set_eth_addr(struct sk_buff *skb,
if (unlikely(err))
return err;
- memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN);
- memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
+
+ ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src);
+ ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst);
+
+ ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
return 0;
}
@@ -158,10 +165,57 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
}
csum_replace4(&nh->check, *addr, new_addr);
- skb->rxhash = 0;
+ skb_clear_hash(skb);
*addr = new_addr;
}
+static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
+ __be32 addr[4], const __be32 new_addr[4])
+{
+ int transport_len = skb->len - skb_transport_offset(skb);
+
+ if (l4_proto == IPPROTO_TCP) {
+ if (likely(transport_len >= sizeof(struct tcphdr)))
+ inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
+ addr, new_addr, 1);
+ } else if (l4_proto == IPPROTO_UDP) {
+ if (likely(transport_len >= sizeof(struct udphdr))) {
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace16(&uh->check, skb,
+ addr, new_addr, 1);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ }
+ }
+}
+
+static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
+ __be32 addr[4], const __be32 new_addr[4],
+ bool recalculate_csum)
+{
+ if (recalculate_csum)
+ update_ipv6_checksum(skb, l4_proto, addr, new_addr);
+
+ skb_clear_hash(skb);
+ memcpy(addr, new_addr, sizeof(__be32[4]));
+}
+
+static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc)
+{
+ nh->priority = tc >> 4;
+ nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4);
+}
+
+static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl)
+{
+ nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16;
+ nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8;
+ nh->flow_lbl[2] = fl & 0x000000FF;
+}
+
static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl)
{
csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
@@ -195,13 +249,54 @@ static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key)
return 0;
}
+static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key)
+{
+ struct ipv6hdr *nh;
+ int err;
+ __be32 *saddr;
+ __be32 *daddr;
+
+ err = make_writable(skb, skb_network_offset(skb) +
+ sizeof(struct ipv6hdr));
+ if (unlikely(err))
+ return err;
+
+ nh = ipv6_hdr(skb);
+ saddr = (__be32 *)&nh->saddr;
+ daddr = (__be32 *)&nh->daddr;
+
+ if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src)))
+ set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr,
+ ipv6_key->ipv6_src, true);
+
+ if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) {
+ unsigned int offset = 0;
+ int flags = IP6_FH_F_SKIP_RH;
+ bool recalc_csum = true;
+
+ if (ipv6_ext_hdr(nh->nexthdr))
+ recalc_csum = ipv6_find_hdr(skb, &offset,
+ NEXTHDR_ROUTING, NULL,
+ &flags) != NEXTHDR_ROUTING;
+
+ set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr,
+ ipv6_key->ipv6_dst, recalc_csum);
+ }
+
+ set_ipv6_tc(nh, ipv6_key->ipv6_tclass);
+ set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label));
+ nh->hop_limit = ipv6_key->ipv6_hlimit;
+
+ return 0;
+}
+
/* Must follow make_writable() since that can move the skb data. */
static void set_tp_port(struct sk_buff *skb, __be16 *port,
__be16 new_port, __sum16 *check)
{
inet_proto_csum_replace2(check, skb, *port, new_port, 0);
*port = new_port;
- skb->rxhash = 0;
+ skb_clear_hash(skb);
}
static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port)
@@ -215,7 +310,7 @@ static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port)
uh->check = CSUM_MANGLED_0;
} else {
*port = new_port;
- skb->rxhash = 0;
+ skb_clear_hash(skb);
}
}
@@ -259,6 +354,39 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key)
return 0;
}
+static int set_sctp(struct sk_buff *skb,
+ const struct ovs_key_sctp *sctp_port_key)
+{
+ struct sctphdr *sh;
+ int err;
+ unsigned int sctphoff = skb_transport_offset(skb);
+
+ err = make_writable(skb, sctphoff + sizeof(struct sctphdr));
+ if (unlikely(err))
+ return err;
+
+ sh = sctp_hdr(skb);
+ if (sctp_port_key->sctp_src != sh->source ||
+ sctp_port_key->sctp_dst != sh->dest) {
+ __le32 old_correct_csum, new_csum, old_csum;
+
+ old_csum = sh->checksum;
+ old_correct_csum = sctp_compute_cksum(skb, sctphoff);
+
+ sh->source = sctp_port_key->sctp_src;
+ sh->dest = sctp_port_key->sctp_dst;
+
+ new_csum = sctp_compute_cksum(skb, sctphoff);
+
+ /* Carry any checksum errors through. */
+ sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
+
+ skb_clear_hash(skb);
+ }
+
+ return 0;
+}
+
static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
struct vport *vport;
@@ -266,7 +394,7 @@ static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
if (unlikely(!skb))
return -ENOMEM;
- vport = rcu_dereference(dp->ports[out_port]);
+ vport = ovs_vport_rcu(dp, out_port);
if (unlikely(!vport)) {
kfree_skb(skb);
return -ENODEV;
@@ -283,10 +411,12 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *a;
int rem;
+ BUG_ON(!OVS_CB(skb)->pkt_key);
+
upcall.cmd = OVS_PACKET_CMD_ACTION;
- upcall.key = &OVS_CB(skb)->flow->key;
+ upcall.key = OVS_CB(skb)->pkt_key;
upcall.userdata = NULL;
- upcall.pid = 0;
+ upcall.portid = 0;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
@@ -296,7 +426,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
break;
case OVS_USERSPACE_ATTR_PID:
- upcall.pid = nla_get_u32(a);
+ upcall.portid = nla_get_u32(a);
break;
}
}
@@ -315,7 +445,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
a = nla_next(a, &rem)) {
switch (nla_type(a)) {
case OVS_SAMPLE_ATTR_PROBABILITY:
- if (net_random() >= nla_get_u32(a))
+ if (prandom_u32() >= nla_get_u32(a))
return 0;
break;
@@ -339,6 +469,14 @@ static int execute_set_action(struct sk_buff *skb,
skb->priority = nla_get_u32(nested_attr);
break;
+ case OVS_KEY_ATTR_SKB_MARK:
+ skb->mark = nla_get_u32(nested_attr);
+ break;
+
+ case OVS_KEY_ATTR_IPV4_TUNNEL:
+ OVS_CB(skb)->tun_key = nla_data(nested_attr);
+ break;
+
case OVS_KEY_ATTR_ETHERNET:
err = set_eth_addr(skb, nla_data(nested_attr));
break;
@@ -347,6 +485,10 @@ static int execute_set_action(struct sk_buff *skb,
err = set_ipv4(skb, nla_data(nested_attr));
break;
+ case OVS_KEY_ATTR_IPV6:
+ err = set_ipv6(skb, nla_data(nested_attr));
+ break;
+
case OVS_KEY_ATTR_TCP:
err = set_tcp(skb, nla_data(nested_attr));
break;
@@ -354,6 +496,10 @@ static int execute_set_action(struct sk_buff *skb,
case OVS_KEY_ATTR_UDP:
err = set_udp(skb, nla_data(nested_attr));
break;
+
+ case OVS_KEY_ATTR_SCTP:
+ err = set_sctp(skb, nla_data(nested_attr));
+ break;
}
return err;
@@ -405,6 +551,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
case OVS_ACTION_ATTR_SAMPLE:
err = sample(dp, skb, a);
+ if (unlikely(err)) /* skb already freed. */
+ return err;
break;
}
@@ -430,6 +578,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
{
struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
+ OVS_CB(skb)->tun_key = NULL;
return do_execute_actions(dp, skb, acts->actions,
acts->actions_len, false);
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2c030505b33..9db4bf6740d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira Networks.
+ * Copyright (c) 2007-2014 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -38,7 +38,6 @@
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
-#include <asm/system.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
@@ -48,53 +47,106 @@
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
-#include <linux/workqueue.h>
+#include <linux/genetlink.h>
+#include <net/genetlink.h>
#include <net/genetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include "datapath.h"
#include "flow.h"
+#include "flow_table.h"
+#include "flow_netlink.h"
#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+int ovs_net_id __read_mostly;
+
+static struct genl_family dp_packet_genl_family;
+static struct genl_family dp_flow_genl_family;
+static struct genl_family dp_datapath_genl_family;
+
+static struct genl_multicast_group ovs_dp_flow_multicast_group = {
+ .name = OVS_FLOW_MCGROUP
+};
+
+static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
+ .name = OVS_DATAPATH_MCGROUP
+};
+
+struct genl_multicast_group ovs_dp_vport_multicast_group = {
+ .name = OVS_VPORT_MCGROUP
+};
+
+/* Check if need to build a reply message.
+ * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
+static bool ovs_must_notify(struct genl_info *info,
+ const struct genl_multicast_group *grp)
+{
+ return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
+ netlink_has_listeners(genl_info_net(info)->genl_sock, 0);
+}
+
+static void ovs_notify(struct genl_family *family,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ genl_notify(family, skb, genl_info_net(info), info->snd_portid,
+ 0, info->nlhdr, GFP_KERNEL);
+}
/**
* DOC: Locking:
*
- * Writes to device state (add/remove datapath, port, set operations on vports,
- * etc.) are protected by RTNL.
- *
- * Writes to other state (flow table modifications, set miscellaneous datapath
- * parameters, etc.) are protected by genl_mutex. The RTNL lock nests inside
- * genl_mutex.
+ * All writes e.g. Writes to device state (add/remove datapath, port, set
+ * operations on vports, etc.), Writes to other state (flow table
+ * modifications, set miscellaneous datapath parameters, etc.) are protected
+ * by ovs_lock.
*
* Reads are protected by RCU.
*
* There are a few special cases (mostly stats) that have their own
* synchronization but they nest under all of above and don't interact with
* each other.
+ *
+ * The RTNL lock nests inside ovs_mutex.
*/
-/* Global list of datapaths to enable dumping them all out.
- * Protected by genl_mutex.
- */
-static LIST_HEAD(dps);
+static DEFINE_MUTEX(ovs_mutex);
-#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
-static void rehash_flow_table(struct work_struct *work);
-static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
+void ovs_lock(void)
+{
+ mutex_lock(&ovs_mutex);
+}
+
+void ovs_unlock(void)
+{
+ mutex_unlock(&ovs_mutex);
+}
+
+#ifdef CONFIG_LOCKDEP
+int lockdep_ovsl_is_held(void)
+{
+ if (debug_locks)
+ return lockdep_is_held(&ovs_mutex);
+ else
+ return 1;
+}
+#endif
static struct vport *new_vport(const struct vport_parms *);
-static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
const struct dp_upcall_info *);
-static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
+static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
const struct dp_upcall_info *);
-/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
-static struct datapath *get_dp(int dp_ifindex)
+/* Must be called with rcu_read_lock or ovs_mutex. */
+static struct datapath *get_dp(struct net *net, int dp_ifindex)
{
struct datapath *dp = NULL;
struct net_device *dev;
rcu_read_lock();
- dev = dev_get_by_index_rcu(&init_net, dp_ifindex);
+ dev = dev_get_by_index_rcu(net, dp_ifindex);
if (dev) {
struct vport *vport = ovs_internal_dev_get_vport(dev);
if (vport)
@@ -105,10 +157,10 @@ static struct datapath *get_dp(int dp_ifindex)
return dp;
}
-/* Must be called with rcu_read_lock or RTNL lock. */
-const char *ovs_dp_name(const struct datapath *dp)
+/* Must be called with rcu_read_lock or ovs_mutex. */
+static const char *ovs_dp_name(const struct datapath *dp)
{
- struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]);
+ struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
return vport->ops->get_name(vport);
}
@@ -119,9 +171,9 @@ static int get_dpifindex(struct datapath *dp)
rcu_read_lock();
- local = rcu_dereference(dp->ports[OVSP_LOCAL]);
+ local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
- ifindex = local->ops->get_ifindex(local);
+ ifindex = netdev_vport_priv(local)->dev->ifindex;
else
ifindex = 0;
@@ -134,12 +186,33 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
- ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
free_percpu(dp->stats_percpu);
+ release_net(ovs_dp_get_net(dp));
+ kfree(dp->ports);
kfree(dp);
}
-/* Called with RTNL lock and genl_lock. */
+static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
+ u16 port_no)
+{
+ return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
+{
+ struct vport *vport;
+ struct hlist_head *head;
+
+ head = vport_hash_bucket(dp, port_no);
+ hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
+ if (vport->port_no == port_no)
+ return vport;
+ }
+ return NULL;
+}
+
+/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
struct vport *vport;
@@ -147,22 +220,19 @@ static struct vport *new_vport(const struct vport_parms *parms)
vport = ovs_vport_add(parms);
if (!IS_ERR(vport)) {
struct datapath *dp = parms->dp;
+ struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
- rcu_assign_pointer(dp->ports[parms->port_no], vport);
- list_add(&vport->node, &dp->port_list);
+ hlist_add_head_rcu(&vport->dp_hash_node, head);
}
-
return vport;
}
-/* Called with RTNL lock. */
void ovs_dp_detach_port(struct vport *p)
{
- ASSERT_RTNL();
+ ASSERT_OVSL();
/* First drop references to device. */
- list_del(&p->node);
- rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
+ hlist_del_rcu(&p->dp_hash_node);
/* Then destroy it. */
ovs_vport_del(p);
@@ -176,27 +246,27 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
struct dp_stats_percpu *stats;
struct sw_flow_key key;
u64 *stats_counter;
+ u32 n_mask_hit;
int error;
- int key_len;
- stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+ stats = this_cpu_ptr(dp->stats_percpu);
/* Extract flow from 'skb' into 'key'. */
- error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
+ error = ovs_flow_extract(skb, p->port_no, &key);
if (unlikely(error)) {
kfree_skb(skb);
return;
}
/* Look up flow. */
- flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
+ flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.key = &key;
upcall.userdata = NULL;
- upcall.pid = p->upcall_pid;
+ upcall.portid = p->upcall_portid;
ovs_dp_upcall(dp, skb, &upcall);
consume_skb(skb);
stats_counter = &stats->n_missed;
@@ -204,83 +274,71 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
}
OVS_CB(skb)->flow = flow;
+ OVS_CB(skb)->pkt_key = &key;
- stats_counter = &stats->n_hit;
- ovs_flow_used(OVS_CB(skb)->flow, skb);
+ ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb);
ovs_execute_actions(dp, skb);
+ stats_counter = &stats->n_hit;
out:
/* Update datapath statistics. */
- u64_stats_update_begin(&stats->sync);
+ u64_stats_update_begin(&stats->syncp);
(*stats_counter)++;
- u64_stats_update_end(&stats->sync);
+ stats->n_mask_hit += n_mask_hit;
+ u64_stats_update_end(&stats->syncp);
}
-static struct genl_family dp_packet_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = sizeof(struct ovs_header),
- .name = OVS_PACKET_FAMILY,
- .version = OVS_PACKET_VERSION,
- .maxattr = OVS_PACKET_ATTR_MAX
-};
-
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
- const struct dp_upcall_info *upcall_info)
+ const struct dp_upcall_info *upcall_info)
{
struct dp_stats_percpu *stats;
- int dp_ifindex;
int err;
- if (upcall_info->pid == 0) {
+ if (upcall_info->portid == 0) {
err = -ENOTCONN;
goto err;
}
- dp_ifindex = get_dpifindex(dp);
- if (!dp_ifindex) {
- err = -ENODEV;
- goto err;
- }
-
if (!skb_is_gso(skb))
- err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ err = queue_userspace_packet(dp, skb, upcall_info);
else
- err = queue_gso_packets(dp_ifindex, skb, upcall_info);
+ err = queue_gso_packets(dp, skb, upcall_info);
if (err)
goto err;
return 0;
err:
- stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+ stats = this_cpu_ptr(dp->stats_percpu);
- u64_stats_update_begin(&stats->sync);
+ u64_stats_update_begin(&stats->syncp);
stats->n_lost++;
- u64_stats_update_end(&stats->sync);
+ u64_stats_update_end(&stats->syncp);
return err;
}
-static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
+static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
+ unsigned short gso_type = skb_shinfo(skb)->gso_type;
struct dp_upcall_info later_info;
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
int err;
- segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ segs = __skb_gso_segment(skb, NETIF_F_SG, false);
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
/* Queue all of the segments. */
skb = segs;
do {
- err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ err = queue_userspace_packet(dp, skb, upcall_info);
if (err)
break;
- if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+ if (skb == segs && gso_type & SKB_GSO_UDP) {
/* The initial flow key extracted by ovs_flow_extract()
* in this case is for a first fragment, so we need to
* properly mark later fragments.
@@ -306,23 +364,69 @@ static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
return err;
}
-static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+static size_t key_attr_size(void)
+{
+ return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
+ + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */
+ + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
+ + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
+ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */
+ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */
+ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
+ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */
+ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */
+ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
+ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */
+ + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */
+ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */
+ + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */
+ + nla_total_size(28); /* OVS_KEY_ATTR_ND */
+}
+
+static size_t upcall_msg_size(const struct nlattr *userdata,
+ unsigned int hdrlen)
+{
+ size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
+
+ /* OVS_PACKET_ATTR_USERDATA */
+ if (userdata)
+ size += NLA_ALIGN(userdata->nla_len);
+
+ return size;
+}
+
+static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb; /* to be queued to userspace */
struct nlattr *nla;
- unsigned int len;
- int err;
+ struct genl_info info = {
+ .dst_sk = ovs_dp_get_net(dp)->genl_sock,
+ .snd_portid = upcall_info->portid,
+ };
+ size_t len;
+ unsigned int hlen;
+ int err, dp_ifindex;
+
+ dp_ifindex = get_dpifindex(dp);
+ if (!dp_ifindex)
+ return -ENODEV;
if (vlan_tx_tag_present(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
- nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
- if (!skb)
+ nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb));
+ if (!nskb)
return -ENOMEM;
nskb->vlan_tci = 0;
@@ -334,13 +438,22 @@ static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
goto out;
}
- len = sizeof(struct ovs_header);
- len += nla_total_size(skb->len);
- len += nla_total_size(FLOW_BUFSIZE);
- if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
- len += nla_total_size(8);
+ /* Complete checksum if needed */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto out;
+
+ /* Older versions of OVS user space enforce alignment of the last
+ * Netlink attribute to NLA_ALIGNTO which would require extensive
+ * padding logic. Only perform zerocopy if padding is not required.
+ */
+ if (dp->user_features & OVS_DP_F_UNALIGNED)
+ hlen = skb_zerocopy_headlen(skb);
+ else
+ hlen = skb->len;
- user_skb = genlmsg_new(len, GFP_ATOMIC);
+ len = upcall_msg_size(upcall_info->userdata, hlen);
+ user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
@@ -351,242 +464,42 @@ static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
upcall->dp_ifindex = dp_ifindex;
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
- ovs_flow_to_nlattrs(upcall_info->key, user_skb);
+ ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb);
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
- nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
- nla_get_u64(upcall_info->userdata));
-
- nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
-
- skb_copy_and_csum_dev(skb, nla_data(nla));
-
- err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
-
-out:
- kfree_skb(nskb);
- return err;
-}
-
-/* Called with genl_mutex. */
-static int flush_flows(int dp_ifindex)
-{
- struct flow_table *old_table;
- struct flow_table *new_table;
- struct datapath *dp;
-
- dp = get_dp(dp_ifindex);
- if (!dp)
- return -ENODEV;
-
- old_table = genl_dereference(dp->table);
- new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
- if (!new_table)
- return -ENOMEM;
-
- rcu_assign_pointer(dp->table, new_table);
-
- ovs_flow_tbl_deferred_destroy(old_table);
- return 0;
-}
-
-static int validate_actions(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth);
-
-static int validate_sample(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth)
-{
- const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
- const struct nlattr *probability, *actions;
- const struct nlattr *a;
- int rem;
-
- memset(attrs, 0, sizeof(attrs));
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
- return -EINVAL;
- attrs[type] = a;
- }
- if (rem)
- return -EINVAL;
-
- probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
- if (!probability || nla_len(probability) != sizeof(u32))
- return -EINVAL;
-
- actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
- if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
- return -EINVAL;
- return validate_actions(actions, key, depth + 1);
-}
-
-static int validate_set(const struct nlattr *a,
- const struct sw_flow_key *flow_key)
-{
- const struct nlattr *ovs_key = nla_data(a);
- int key_type = nla_type(ovs_key);
-
- /* There can be only one key in a action */
- if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
- return -EINVAL;
-
- if (key_type > OVS_KEY_ATTR_MAX ||
- nla_len(ovs_key) != ovs_key_lens[key_type])
- return -EINVAL;
-
- switch (key_type) {
- const struct ovs_key_ipv4 *ipv4_key;
-
- case OVS_KEY_ATTR_PRIORITY:
- case OVS_KEY_ATTR_ETHERNET:
- break;
-
- case OVS_KEY_ATTR_IPV4:
- if (flow_key->eth.type != htons(ETH_P_IP))
- return -EINVAL;
-
- if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst)
- return -EINVAL;
-
- ipv4_key = nla_data(ovs_key);
- if (ipv4_key->ipv4_proto != flow_key->ip.proto)
- return -EINVAL;
-
- if (ipv4_key->ipv4_frag != flow_key->ip.frag)
- return -EINVAL;
-
- break;
-
- case OVS_KEY_ATTR_TCP:
- if (flow_key->ip.proto != IPPROTO_TCP)
- return -EINVAL;
-
- if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
- return -EINVAL;
-
- break;
-
- case OVS_KEY_ATTR_UDP:
- if (flow_key->ip.proto != IPPROTO_UDP)
- return -EINVAL;
-
- if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
- return -EINVAL;
- break;
-
- default:
- return -EINVAL;
+ __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
+ nla_len(upcall_info->userdata),
+ nla_data(upcall_info->userdata));
+
+ /* Only reserve room for attribute header, packet data is added
+ * in skb_zerocopy() */
+ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
+ err = -ENOBUFS;
+ goto out;
}
+ nla->nla_len = nla_attr_size(skb->len);
- return 0;
-}
-
-static int validate_userspace(const struct nlattr *attr)
-{
- static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
- [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
- [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
- };
- struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
- int error;
-
- error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
- attr, userspace_policy);
- if (error)
- return error;
-
- if (!a[OVS_USERSPACE_ATTR_PID] ||
- !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
- return -EINVAL;
-
- return 0;
-}
-
-static int validate_actions(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth)
-{
- const struct nlattr *a;
- int rem, err;
-
- if (depth >= SAMPLE_ACTION_DEPTH)
- return -EOVERFLOW;
-
- nla_for_each_nested(a, attr, rem) {
- /* Expected argument lengths, (u32)-1 for variable length. */
- static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
- [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
- [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
- [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
- [OVS_ACTION_ATTR_POP_VLAN] = 0,
- [OVS_ACTION_ATTR_SET] = (u32)-1,
- [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
- };
- const struct ovs_action_push_vlan *vlan;
- int type = nla_type(a);
-
- if (type > OVS_ACTION_ATTR_MAX ||
- (action_lens[type] != nla_len(a) &&
- action_lens[type] != (u32)-1))
- return -EINVAL;
-
- switch (type) {
- case OVS_ACTION_ATTR_UNSPEC:
- return -EINVAL;
-
- case OVS_ACTION_ATTR_USERSPACE:
- err = validate_userspace(a);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_OUTPUT:
- if (nla_get_u32(a) >= DP_MAX_PORTS)
- return -EINVAL;
- break;
-
-
- case OVS_ACTION_ATTR_POP_VLAN:
- break;
-
- case OVS_ACTION_ATTR_PUSH_VLAN:
- vlan = nla_data(a);
- if (vlan->vlan_tpid != htons(ETH_P_8021Q))
- return -EINVAL;
- if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
- return -EINVAL;
- break;
-
- case OVS_ACTION_ATTR_SET:
- err = validate_set(a, key);
- if (err)
- return err;
- break;
+ err = skb_zerocopy(user_skb, skb, skb->len, hlen);
+ if (err)
+ goto out;
- case OVS_ACTION_ATTR_SAMPLE:
- err = validate_sample(a, key, depth);
- if (err)
- return err;
- break;
+ /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
+ if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
+ size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
- default:
- return -EINVAL;
- }
+ if (plen > 0)
+ memset(skb_put(user_skb, plen), 0, plen);
}
- if (rem > 0)
- return -EINVAL;
+ ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
- return 0;
-}
-
-static void clear_stats(struct sw_flow *flow)
-{
- flow->used = 0;
- flow->tcp_flags = 0;
- flow->packet_count = 0;
- flow->byte_count = 0;
+ err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
+out:
+ if (err)
+ skb_tx_error(skb);
+ kfree_skb(nskb);
+ return err;
}
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
@@ -600,12 +513,10 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct ethhdr *eth;
int len;
int err;
- int key_len;
err = -EINVAL;
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
- !a[OVS_PACKET_ATTR_ACTIONS] ||
- nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
+ !a[OVS_PACKET_ATTR_ACTIONS])
goto err;
len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
@@ -615,7 +526,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
goto err;
skb_reserve(packet, NET_IP_ALIGN);
- memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);
+ nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
skb_reset_mac_header(packet);
eth = eth_hdr(packet);
@@ -623,7 +534,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have. */
- if (ntohs(eth->h_proto) >= 1536)
+ if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
@@ -634,33 +545,31 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(flow))
goto err_kfree_skb;
- err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
+ err = ovs_flow_extract(packet, -1, &flow->key);
if (err)
goto err_flow_free;
- err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
- &flow->key.phy.in_port,
- a[OVS_PACKET_ATTR_KEY]);
+ err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
-
- err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
- if (err)
- goto err_flow_free;
-
- flow->hash = ovs_flow_hash(&flow->key, key_len);
-
- acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
err = PTR_ERR(acts);
if (IS_ERR(acts))
goto err_flow_free;
+
+ err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
+ &flow->key, 0, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
+ if (err)
+ goto err_flow_free;
OVS_CB(packet)->flow = flow;
+ OVS_CB(packet)->pkt_key = &flow->key;
packet->priority = flow->key.phy.priority;
+ packet->mark = flow->key.phy.skb_mark;
rcu_read_lock();
- dp = get_dp(ovs_header->dp_ifindex);
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto err_unlock;
@@ -670,13 +579,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
local_bh_enable();
rcu_read_unlock();
- ovs_flow_free(flow);
+ ovs_flow_free(flow, false);
return err;
err_unlock:
rcu_read_unlock();
err_flow_free:
- ovs_flow_free(flow);
+ ovs_flow_free(flow, false);
err_kfree_skb:
kfree_skb(packet);
err:
@@ -684,12 +593,12 @@ err:
}
static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
- [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC },
+ [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
};
-static struct genl_ops dp_packet_genl_ops[] = {
+static const struct genl_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = packet_policy,
@@ -697,14 +606,30 @@ static struct genl_ops dp_packet_genl_ops[] = {
}
};
-static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
+static struct genl_family dp_packet_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_PACKET_FAMILY,
+ .version = OVS_PACKET_VERSION,
+ .maxattr = OVS_PACKET_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_packet_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+};
+
+static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
+ struct ovs_dp_megaflow_stats *mega_stats)
{
int i;
- struct flow_table *table = genl_dereference(dp->table);
- stats->n_flows = ovs_flow_tbl_count(table);
+ memset(mega_stats, 0, sizeof(*mega_stats));
+
+ stats->n_flows = ovs_flow_tbl_count(&dp->table);
+ mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
stats->n_hit = stats->n_missed = stats->n_lost = 0;
+
for_each_possible_cpu(i) {
const struct dp_stats_percpu *percpu_stats;
struct dp_stats_percpu local_stats;
@@ -713,81 +638,81 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
do {
- start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
+ start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
+ } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
+ mega_stats->n_mask_hit += local_stats.n_mask_hit;
}
}
-static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
- [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
- [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
- [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
-};
-
-static struct genl_family dp_flow_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = sizeof(struct ovs_header),
- .name = OVS_FLOW_FAMILY,
- .version = OVS_FLOW_VERSION,
- .maxattr = OVS_FLOW_ATTR_MAX
-};
-
-static struct genl_multicast_group ovs_dp_flow_multicast_group = {
- .name = OVS_FLOW_MCGROUP
-};
+static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
+{
+ return NLMSG_ALIGN(sizeof(struct ovs_header))
+ + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+ + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
+ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
+ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */
+ + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */
+}
-/* Called with genl_lock. */
-static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
- struct sk_buff *skb, u32 pid,
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
+ struct sk_buff *skb, u32 portid,
u32 seq, u32 flags, u8 cmd)
{
const int skb_orig_len = skb->len;
- const struct sw_flow_actions *sf_acts;
+ struct nlattr *start;
struct ovs_flow_stats stats;
+ __be16 tcp_flags;
+ unsigned long used;
struct ovs_header *ovs_header;
struct nlattr *nla;
- unsigned long used;
- u8 tcp_flags;
int err;
- sf_acts = rcu_dereference_protected(flow->sf_acts,
- lockdep_genl_is_held());
-
- ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd);
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
- ovs_header->dp_ifindex = get_dpifindex(dp);
+ ovs_header->dp_ifindex = dp_ifindex;
+ /* Fill flow key. */
nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
if (!nla)
goto nla_put_failure;
- err = ovs_flow_to_nlattrs(&flow->key, skb);
+
+ err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
if (err)
goto error;
nla_nest_end(skb, nla);
- spin_lock_bh(&flow->lock);
- used = flow->used;
- stats.n_packets = flow->packet_count;
- stats.n_bytes = flow->byte_count;
- tcp_flags = flow->tcp_flags;
- spin_unlock_bh(&flow->lock);
+ nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
+ if (!nla)
+ goto nla_put_failure;
- if (used)
- NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used));
+ err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
+ if (err)
+ goto error;
- if (stats.n_packets)
- NLA_PUT(skb, OVS_FLOW_ATTR_STATS,
- sizeof(struct ovs_flow_stats), &stats);
+ nla_nest_end(skb, nla);
- if (tcp_flags)
- NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags);
+ ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
+
+ if (used &&
+ nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
+ goto nla_put_failure;
+
+ if (stats.n_packets &&
+ nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
+ goto nla_put_failure;
+
+ if ((u8)ntohs(tcp_flags) &&
+ nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
+ goto nla_put_failure;
/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
* this is the first flow to be dumped into 'skb'. This is unusual for
@@ -799,10 +724,24 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
* This can only fail for dump operations because the skb is always
* properly sized for single flows.
*/
- err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
- sf_acts->actions);
- if (err < 0 && skb_orig_len)
- goto error;
+ start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
+ if (start) {
+ const struct sw_flow_actions *sf_acts;
+
+ sf_acts = rcu_dereference_ovsl(flow->sf_acts);
+ err = ovs_nla_put_actions(sf_acts->actions,
+ sf_acts->actions_len, skb);
+
+ if (!err)
+ nla_nest_end(skb, start);
+ else {
+ if (skb_orig_len)
+ goto error;
+
+ nla_nest_cancel(skb, start);
+ }
+ } else if (skb_orig_len)
+ goto nla_put_failure;
return genlmsg_end(skb, ovs_header);
@@ -813,130 +752,129 @@ error:
return err;
}
-static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
+/* May not be called with RCU read lock. */
+static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
+ struct genl_info *info,
+ bool always)
{
- const struct sw_flow_actions *sf_acts;
- int len;
-
- sf_acts = rcu_dereference_protected(flow->sf_acts,
- lockdep_genl_is_held());
+ struct sk_buff *skb;
- /* OVS_FLOW_ATTR_KEY */
- len = nla_total_size(FLOW_BUFSIZE);
- /* OVS_FLOW_ATTR_ACTIONS */
- len += nla_total_size(sf_acts->actions_len);
- /* OVS_FLOW_ATTR_STATS */
- len += nla_total_size(sizeof(struct ovs_flow_stats));
- /* OVS_FLOW_ATTR_TCP_FLAGS */
- len += nla_total_size(1);
- /* OVS_FLOW_ATTR_USED */
- len += nla_total_size(8);
+ if (!always && !ovs_must_notify(info, &ovs_dp_flow_multicast_group))
+ return NULL;
- len += NLMSG_ALIGN(sizeof(struct ovs_header));
+ skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
- return genlmsg_new(len, GFP_KERNEL);
+ return skb;
}
-static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
- struct datapath *dp,
- u32 pid, u32 seq, u8 cmd)
+/* Called with ovs_mutex. */
+static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
+ int dp_ifindex,
+ struct genl_info *info, u8 cmd,
+ bool always)
{
struct sk_buff *skb;
int retval;
- skb = ovs_flow_cmd_alloc_info(flow);
- if (!skb)
- return ERR_PTR(-ENOMEM);
+ skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info,
+ always);
+ if (!skb || IS_ERR(skb))
+ return skb;
- retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd);
+ retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
+ info->snd_portid, info->snd_seq, 0,
+ cmd);
BUG_ON(retval < 0);
return skb;
}
-static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
+static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
- struct sw_flow_key key;
- struct sw_flow *flow;
+ struct sw_flow *flow, *new_flow;
+ struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
- struct flow_table *table;
+ struct sw_flow_actions *acts;
+ struct sw_flow_match match;
int error;
- int key_len;
- /* Extract key. */
+ /* Must have key and actions. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY])
goto error;
- error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
- if (error)
+ if (!a[OVS_FLOW_ATTR_ACTIONS])
goto error;
- /* Validate actions. */
- if (a[OVS_FLOW_ATTR_ACTIONS]) {
- error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
- if (error)
- goto error;
- } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
- error = -EINVAL;
+ /* Most of the time we need to allocate a new flow, do it before
+ * locking.
+ */
+ new_flow = ovs_flow_alloc();
+ if (IS_ERR(new_flow)) {
+ error = PTR_ERR(new_flow);
goto error;
}
- dp = get_dp(ovs_header->dp_ifindex);
- error = -ENODEV;
- if (!dp)
- goto error;
-
- table = genl_dereference(dp->table);
- flow = ovs_flow_tbl_lookup(table, &key, key_len);
- if (!flow) {
- struct sw_flow_actions *acts;
+ /* Extract key. */
+ ovs_match_init(&match, &new_flow->unmasked_key, &mask);
+ error = ovs_nla_get_match(&match,
+ a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
+ if (error)
+ goto err_kfree_flow;
- /* Bail out if we're not allowed to create a new flow. */
- error = -ENOENT;
- if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
- goto error;
+ ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask);
- /* Expand table, if necessary, to make room. */
- if (ovs_flow_tbl_need_to_expand(table)) {
- struct flow_table *new_table;
+ /* Validate actions. */
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
+ error = PTR_ERR(acts);
+ if (IS_ERR(acts))
+ goto err_kfree_flow;
- new_table = ovs_flow_tbl_expand(table);
- if (!IS_ERR(new_table)) {
- rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_deferred_destroy(table);
- table = genl_dereference(dp->table);
- }
- }
+ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
+ 0, &acts);
+ if (error) {
+ OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
+ goto err_kfree_acts;
+ }
- /* Allocate flow. */
- flow = ovs_flow_alloc();
- if (IS_ERR(flow)) {
- error = PTR_ERR(flow);
- goto error;
- }
- flow->key = key;
- clear_stats(flow);
+ reply = ovs_flow_cmd_alloc_info(acts, info, false);
+ if (IS_ERR(reply)) {
+ error = PTR_ERR(reply);
+ goto err_kfree_acts;
+ }
- /* Obtain actions. */
- acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
- error = PTR_ERR(acts);
- if (IS_ERR(acts))
- goto error_free_flow;
- rcu_assign_pointer(flow->sf_acts, acts);
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (unlikely(!dp)) {
+ error = -ENODEV;
+ goto err_unlock_ovs;
+ }
+ /* Check if this is a duplicate flow */
+ flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key);
+ if (likely(!flow)) {
+ rcu_assign_pointer(new_flow->sf_acts, acts);
/* Put flow in bucket. */
- flow->hash = ovs_flow_hash(&key, key_len);
- ovs_flow_tbl_insert(table, flow);
+ error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
+ if (unlikely(error)) {
+ acts = NULL;
+ goto err_unlock_ovs;
+ }
- reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
- info->snd_seq,
- OVS_FLOW_CMD_NEW);
+ if (unlikely(reply)) {
+ error = ovs_flow_cmd_fill_info(new_flow,
+ ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_NEW);
+ BUG_ON(error < 0);
+ }
+ ovs_unlock();
} else {
- /* We found a matching flow. */
struct sw_flow_actions *old_acts;
- struct nlattr *acts_attrs;
/* Bail out if we're not allowed to modify an existing flow.
* We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
@@ -944,52 +882,154 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
* request. We also accept NLM_F_EXCL in case that bug ever
* gets fixed.
*/
- error = -EEXIST;
- if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
- info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
+ if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
+ | NLM_F_EXCL))) {
+ error = -EEXIST;
+ goto err_unlock_ovs;
+ }
+ /* The unmasked key has to be the same for flow updates. */
+ if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (!flow) {
+ error = -ENOENT;
+ goto err_unlock_ovs;
+ }
+ }
+ /* Update actions. */
+ old_acts = ovsl_dereference(flow->sf_acts);
+ rcu_assign_pointer(flow->sf_acts, acts);
+
+ if (unlikely(reply)) {
+ error = ovs_flow_cmd_fill_info(flow,
+ ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_NEW);
+ BUG_ON(error < 0);
+ }
+ ovs_unlock();
+
+ ovs_nla_free_flow_actions(old_acts);
+ ovs_flow_free(new_flow, false);
+ }
+
+ if (reply)
+ ovs_notify(&dp_flow_genl_family, reply, info);
+ return 0;
+
+err_unlock_ovs:
+ ovs_unlock();
+ kfree_skb(reply);
+err_kfree_acts:
+ kfree(acts);
+err_kfree_flow:
+ ovs_flow_free(new_flow, false);
+error:
+ return error;
+}
+
+static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key, masked_key;
+ struct sw_flow *flow;
+ struct sw_flow_mask mask;
+ struct sk_buff *reply = NULL;
+ struct datapath *dp;
+ struct sw_flow_actions *old_acts = NULL, *acts = NULL;
+ struct sw_flow_match match;
+ int error;
+
+ /* Extract key. */
+ error = -EINVAL;
+ if (!a[OVS_FLOW_ATTR_KEY])
+ goto error;
+
+ ovs_match_init(&match, &key, &mask);
+ error = ovs_nla_get_match(&match,
+ a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
+ if (error)
+ goto error;
+
+ /* Validate actions. */
+ if (a[OVS_FLOW_ATTR_ACTIONS]) {
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
+ error = PTR_ERR(acts);
+ if (IS_ERR(acts))
goto error;
- /* Update actions. */
- old_acts = rcu_dereference_protected(flow->sf_acts,
- lockdep_genl_is_held());
- acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
- if (acts_attrs &&
- (old_acts->actions_len != nla_len(acts_attrs) ||
- memcmp(old_acts->actions, nla_data(acts_attrs),
- old_acts->actions_len))) {
- struct sw_flow_actions *new_acts;
-
- new_acts = ovs_flow_actions_alloc(acts_attrs);
- error = PTR_ERR(new_acts);
- if (IS_ERR(new_acts))
- goto error;
+ ovs_flow_mask_key(&masked_key, &key, &mask);
+ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
+ &masked_key, 0, &acts);
+ if (error) {
+ OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
+ goto err_kfree_acts;
+ }
+ }
- rcu_assign_pointer(flow->sf_acts, new_acts);
- ovs_flow_deferred_free_acts(old_acts);
+ /* Can allocate before locking if have acts. */
+ if (acts) {
+ reply = ovs_flow_cmd_alloc_info(acts, info, false);
+ if (IS_ERR(reply)) {
+ error = PTR_ERR(reply);
+ goto err_kfree_acts;
}
+ }
- reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
- info->snd_seq, OVS_FLOW_CMD_NEW);
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (unlikely(!dp)) {
+ error = -ENODEV;
+ goto err_unlock_ovs;
+ }
+ /* Check that the flow exists. */
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (unlikely(!flow)) {
+ error = -ENOENT;
+ goto err_unlock_ovs;
+ }
+
+ /* Update actions, if present. */
+ if (likely(acts)) {
+ old_acts = ovsl_dereference(flow->sf_acts);
+ rcu_assign_pointer(flow->sf_acts, acts);
- /* Clear stats. */
- if (a[OVS_FLOW_ATTR_CLEAR]) {
- spin_lock_bh(&flow->lock);
- clear_stats(flow);
- spin_unlock_bh(&flow->lock);
+ if (unlikely(reply)) {
+ error = ovs_flow_cmd_fill_info(flow,
+ ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_NEW);
+ BUG_ON(error < 0);
+ }
+ } else {
+ /* Could not alloc without acts before locking. */
+ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
+ info, OVS_FLOW_CMD_NEW, false);
+ if (unlikely(IS_ERR(reply))) {
+ error = PTR_ERR(reply);
+ goto err_unlock_ovs;
}
}
- if (!IS_ERR(reply))
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_flow_multicast_group.id, info->nlhdr,
- GFP_KERNEL);
- else
- netlink_set_err(init_net.genl_sock, 0,
- ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
+ /* Clear stats. */
+ if (a[OVS_FLOW_ATTR_CLEAR])
+ ovs_flow_stats_clear(flow);
+ ovs_unlock();
+
+ if (reply)
+ ovs_notify(&dp_flow_genl_family, reply, info);
+ if (old_acts)
+ ovs_nla_free_flow_actions(old_acts);
+
return 0;
-error_free_flow:
- ovs_flow_free(flow);
+err_unlock_ovs:
+ ovs_unlock();
+ kfree_skb(reply);
+err_kfree_acts:
+ kfree(acts);
error:
return error;
}
@@ -1002,31 +1042,44 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
- struct flow_table *table;
+ struct sw_flow_match match;
int err;
- int key_len;
- if (!a[OVS_FLOW_ATTR_KEY])
+ if (!a[OVS_FLOW_ATTR_KEY]) {
+ OVS_NLERR("Flow get message rejected, Key attribute missing.\n");
return -EINVAL;
- err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+ }
+
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
return err;
- dp = get_dp(ovs_header->dp_ifindex);
- if (!dp)
- return -ENODEV;
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ err = -ENODEV;
+ goto unlock;
+ }
- table = genl_dereference(dp->table);
- flow = ovs_flow_tbl_lookup(table, &key, key_len);
- if (!flow)
- return -ENOENT;
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (!flow) {
+ err = -ENOENT;
+ goto unlock;
+ }
- reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
- info->snd_seq, OVS_FLOW_CMD_NEW);
- if (IS_ERR(reply))
- return PTR_ERR(reply);
+ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
+ OVS_FLOW_CMD_NEW, true);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ goto unlock;
+ }
+ ovs_unlock();
return genlmsg_reply(reply, info);
+unlock:
+ ovs_unlock();
+ return err;
}
static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
@@ -1037,66 +1090,88 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
- struct flow_table *table;
+ struct sw_flow_match match;
int err;
- int key_len;
- if (!a[OVS_FLOW_ATTR_KEY])
- return flush_flows(ovs_header->dp_ifindex);
- err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
- if (err)
- return err;
-
- dp = get_dp(ovs_header->dp_ifindex);
- if (!dp)
- return -ENODEV;
-
- table = genl_dereference(dp->table);
- flow = ovs_flow_tbl_lookup(table, &key, key_len);
- if (!flow)
- return -ENOENT;
+ if (likely(a[OVS_FLOW_ATTR_KEY])) {
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
+ if (unlikely(err))
+ return err;
+ }
- reply = ovs_flow_cmd_alloc_info(flow);
- if (!reply)
- return -ENOMEM;
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (unlikely(!dp)) {
+ err = -ENODEV;
+ goto unlock;
+ }
- ovs_flow_tbl_remove(table, flow);
+ if (unlikely(!a[OVS_FLOW_ATTR_KEY])) {
+ err = ovs_flow_tbl_flush(&dp->table);
+ goto unlock;
+ }
- err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
- info->snd_seq, 0, OVS_FLOW_CMD_DEL);
- BUG_ON(err < 0);
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (unlikely(!flow)) {
+ err = -ENOENT;
+ goto unlock;
+ }
- ovs_flow_deferred_free(flow);
+ ovs_flow_tbl_remove(&dp->table, flow);
+ ovs_unlock();
+
+ reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts,
+ info, false);
+ if (likely(reply)) {
+ if (likely(!IS_ERR(reply))) {
+ rcu_read_lock(); /*To keep RCU checker happy. */
+ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_DEL);
+ rcu_read_unlock();
+ BUG_ON(err < 0);
+
+ ovs_notify(&dp_flow_genl_family, reply, info);
+ } else {
+ netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply));
+ }
+ }
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL);
+ ovs_flow_free(flow, true);
return 0;
+unlock:
+ ovs_unlock();
+ return err;
}
static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct table_instance *ti;
struct datapath *dp;
- struct flow_table *table;
- dp = get_dp(ovs_header->dp_ifindex);
- if (!dp)
+ rcu_read_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ rcu_read_unlock();
return -ENODEV;
+ }
- table = genl_dereference(dp->table);
-
+ ti = rcu_dereference(dp->table.ti);
for (;;) {
struct sw_flow *flow;
u32 bucket, obj;
bucket = cb->args[0];
obj = cb->args[1];
- flow = ovs_flow_tbl_next(table, &bucket, &obj);
+ flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
if (!flow)
break;
- if (ovs_flow_cmd_fill_info(flow, dp, skb,
- NETLINK_CB(cb->skb).pid,
+ if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_FLOW_CMD_NEW) < 0)
break;
@@ -1104,14 +1179,21 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[0] = bucket;
cb->args[1] = obj;
}
+ rcu_read_unlock();
return skb->len;
}
+static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
+ [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
+};
+
static struct genl_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
- .doit = ovs_flow_cmd_new_or_set
+ .doit = ovs_flow_cmd_new
},
{ .cmd = OVS_FLOW_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1127,49 +1209,68 @@ static struct genl_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
- .doit = ovs_flow_cmd_new_or_set,
+ .doit = ovs_flow_cmd_set,
},
};
-static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
- [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
- [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
-};
-
-static struct genl_family dp_datapath_genl_family = {
+static struct genl_family dp_flow_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
- .name = OVS_DATAPATH_FAMILY,
- .version = OVS_DATAPATH_VERSION,
- .maxattr = OVS_DP_ATTR_MAX
+ .name = OVS_FLOW_FAMILY,
+ .version = OVS_FLOW_VERSION,
+ .maxattr = OVS_FLOW_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_flow_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
+ .mcgrps = &ovs_dp_flow_multicast_group,
+ .n_mcgrps = 1,
};
-static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
- .name = OVS_DATAPATH_MCGROUP
-};
+static size_t ovs_dp_cmd_msg_size(void)
+{
+ size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
+
+ msgsize += nla_total_size(IFNAMSIZ);
+ msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
+ msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
+ return msgsize;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
- u32 pid, u32 seq, u32 flags, u8 cmd)
+ u32 portid, u32 seq, u32 flags, u8 cmd)
{
struct ovs_header *ovs_header;
struct ovs_dp_stats dp_stats;
+ struct ovs_dp_megaflow_stats dp_megaflow_stats;
int err;
- ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
flags, cmd);
if (!ovs_header)
goto error;
ovs_header->dp_ifindex = get_dpifindex(dp);
- rcu_read_lock();
err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
- rcu_read_unlock();
if (err)
goto nla_put_failure;
- get_dp_stats(dp, &dp_stats);
- NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats);
+ get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
+ if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
+ &dp_stats))
+ goto nla_put_failure;
+
+ if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
+ sizeof(struct ovs_dp_megaflow_stats),
+ &dp_megaflow_stats))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
+ goto nla_put_failure;
return genlmsg_end(skb, ovs_header);
@@ -1179,43 +1280,47 @@ error:
return -EMSGSIZE;
}
-static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid,
- u32 seq, u8 cmd)
+static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
{
- struct sk_buff *skb;
- int retval;
-
- skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!skb)
- return ERR_PTR(-ENOMEM);
-
- retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
- if (retval < 0) {
- kfree_skb(skb);
- return ERR_PTR(retval);
- }
- return skb;
+ return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
}
-/* Called with genl_mutex and optionally with RTNL lock also. */
-static struct datapath *lookup_datapath(struct ovs_header *ovs_header,
+/* Called with rcu_read_lock or ovs_mutex. */
+static struct datapath *lookup_datapath(struct net *net,
+ struct ovs_header *ovs_header,
struct nlattr *a[OVS_DP_ATTR_MAX + 1])
{
struct datapath *dp;
if (!a[OVS_DP_ATTR_NAME])
- dp = get_dp(ovs_header->dp_ifindex);
+ dp = get_dp(net, ovs_header->dp_ifindex);
else {
struct vport *vport;
- rcu_read_lock();
- vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME]));
+ vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
- rcu_read_unlock();
}
return dp ? dp : ERR_PTR(-ENODEV);
}
+static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
+{
+ struct datapath *dp;
+
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ if (IS_ERR(dp))
+ return;
+
+ WARN(dp->user_features, "Dropping previously announced user features\n");
+ dp->user_features = 0;
+}
+
+static void ovs_dp_change(struct datapath *dp, struct nlattr **a)
+{
+ if (a[OVS_DP_ATTR_USER_FEATURES])
+ dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
+}
+
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
@@ -1223,42 +1328,57 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct datapath *dp;
struct vport *vport;
- int err;
+ struct ovs_net *ovs_net;
+ int err, i;
err = -EINVAL;
if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
goto err;
- rtnl_lock();
- err = -ENODEV;
- if (!try_module_get(THIS_MODULE))
- goto err_unlock_rtnl;
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
err = -ENOMEM;
dp = kzalloc(sizeof(*dp), GFP_KERNEL);
if (dp == NULL)
- goto err_put_module;
- INIT_LIST_HEAD(&dp->port_list);
+ goto err_free_reply;
+
+ ovs_dp_set_net(dp, hold_net(sock_net(skb->sk)));
/* Allocate table. */
- err = -ENOMEM;
- rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
- if (!dp->table)
+ err = ovs_flow_tbl_init(&dp->table);
+ if (err)
goto err_free_dp;
- dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
+ dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
if (!dp->stats_percpu) {
err = -ENOMEM;
goto err_destroy_table;
}
+ dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!dp->ports) {
+ err = -ENOMEM;
+ goto err_destroy_percpu;
+ }
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(&dp->ports[i]);
+
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
parms.options = NULL;
parms.dp = dp;
parms.port_no = OVSP_LOCAL;
- parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
+ parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
+
+ ovs_dp_change(dp, a);
+
+ /* So far only local changes have been made, now need the lock. */
+ ovs_lock();
vport = new_vport(&parms);
if (IS_ERR(vport)) {
@@ -1266,83 +1386,103 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (err == -EBUSY)
err = -EEXIST;
- goto err_destroy_percpu;
+ if (err == -EEXIST) {
+ /* An outdated user space instance that does not understand
+ * the concept of user_features has attempted to create a new
+ * datapath and is likely to reuse it. Drop all user features.
+ */
+ if (info->genlhdr->version < OVS_DP_VER_FEATURES)
+ ovs_dp_reset_user_features(skb, info);
+ }
+
+ goto err_destroy_ports_array;
}
- reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
- info->snd_seq, OVS_DP_CMD_NEW);
- err = PTR_ERR(reply);
- if (IS_ERR(reply))
- goto err_destroy_local_port;
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_NEW);
+ BUG_ON(err < 0);
+
+ ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
+ list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
- list_add_tail(&dp->list_node, &dps);
- rtnl_unlock();
+ ovs_unlock();
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_datapath_multicast_group.id, info->nlhdr,
- GFP_KERNEL);
+ ovs_notify(&dp_datapath_genl_family, reply, info);
return 0;
-err_destroy_local_port:
- ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
+err_destroy_ports_array:
+ ovs_unlock();
+ kfree(dp->ports);
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
- ovs_flow_tbl_destroy(genl_dereference(dp->table));
+ ovs_flow_tbl_destroy(&dp->table, false);
err_free_dp:
+ release_net(ovs_dp_get_net(dp));
kfree(dp);
-err_put_module:
- module_put(THIS_MODULE);
-err_unlock_rtnl:
- rtnl_unlock();
+err_free_reply:
+ kfree_skb(reply);
err:
return err;
}
+/* Called with ovs_mutex. */
+static void __dp_destroy(struct datapath *dp)
+{
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ struct vport *vport;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
+ if (vport->port_no != OVSP_LOCAL)
+ ovs_dp_detach_port(vport);
+ }
+
+ list_del_rcu(&dp->list_node);
+
+ /* OVSP_LOCAL is datapath internal port. We need to make sure that
+ * all ports in datapath are destroyed first before freeing datapath.
+ */
+ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
+
+ /* RCU destroy the flow table */
+ ovs_flow_tbl_destroy(&dp->table, true);
+
+ call_rcu(&dp->rcu, destroy_dp_rcu);
+}
+
static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
- struct vport *vport, *next_vport;
struct sk_buff *reply;
struct datapath *dp;
int err;
- rtnl_lock();
- dp = lookup_datapath(info->userhdr, info->attrs);
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
err = PTR_ERR(dp);
if (IS_ERR(dp))
- goto exit_unlock;
-
- reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
- info->snd_seq, OVS_DP_CMD_DEL);
- err = PTR_ERR(reply);
- if (IS_ERR(reply))
- goto exit_unlock;
-
- list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
- if (vport->port_no != OVSP_LOCAL)
- ovs_dp_detach_port(vport);
-
- list_del(&dp->list_node);
- ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
+ goto err_unlock_free;
- /* rtnl_unlock() will wait until all the references to devices that
- * are pending unregistration have been dropped. We do it here to
- * ensure that any internal devices (which contain DP pointers) are
- * fully destroyed before freeing the datapath.
- */
- rtnl_unlock();
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_DEL);
+ BUG_ON(err < 0);
- call_rcu(&dp->rcu, destroy_dp_rcu);
- module_put(THIS_MODULE);
+ __dp_destroy(dp);
+ ovs_unlock();
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_datapath_multicast_group.id, info->nlhdr,
- GFP_KERNEL);
+ ovs_notify(&dp_datapath_genl_family, reply, info);
return 0;
-exit_unlock:
- rtnl_unlock();
+err_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
return err;
}
@@ -1352,63 +1492,91 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
int err;
- dp = lookup_datapath(info->userhdr, info->attrs);
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ err = PTR_ERR(dp);
if (IS_ERR(dp))
- return PTR_ERR(dp);
+ goto err_unlock_free;
- reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
- info->snd_seq, OVS_DP_CMD_NEW);
- if (IS_ERR(reply)) {
- err = PTR_ERR(reply);
- netlink_set_err(init_net.genl_sock, 0,
- ovs_dp_datapath_multicast_group.id, err);
- return 0;
- }
+ ovs_dp_change(dp, info->attrs);
+
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_NEW);
+ BUG_ON(err < 0);
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_datapath_multicast_group.id, info->nlhdr,
- GFP_KERNEL);
+ ovs_unlock();
+ ovs_notify(&dp_datapath_genl_family, reply, info);
return 0;
+
+err_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
}
static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
+ int err;
- dp = lookup_datapath(info->userhdr, info->attrs);
- if (IS_ERR(dp))
- return PTR_ERR(dp);
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
- reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
- info->snd_seq, OVS_DP_CMD_NEW);
- if (IS_ERR(reply))
- return PTR_ERR(reply);
+ rcu_read_lock();
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ if (IS_ERR(dp)) {
+ err = PTR_ERR(dp);
+ goto err_unlock_free;
+ }
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_NEW);
+ BUG_ON(err < 0);
+ rcu_read_unlock();
return genlmsg_reply(reply, info);
+
+err_unlock_free:
+ rcu_read_unlock();
+ kfree_skb(reply);
+ return err;
}
static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
struct datapath *dp;
int skip = cb->args[0];
int i = 0;
- list_for_each_entry(dp, &dps, list_node) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
if (i >= skip &&
- ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
+ ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_DP_CMD_NEW) < 0)
break;
i++;
}
+ rcu_read_unlock();
cb->args[0] = i;
return skb->len;
}
+static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
+ [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+ [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
+};
+
static struct genl_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1433,50 +1601,45 @@ static struct genl_ops dp_datapath_genl_ops[] = {
},
};
-static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
- [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
- [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
- [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
- [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
- [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
- [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
-};
-
-static struct genl_family dp_vport_genl_family = {
+static struct genl_family dp_datapath_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
- .name = OVS_VPORT_FAMILY,
- .version = OVS_VPORT_VERSION,
- .maxattr = OVS_VPORT_ATTR_MAX
-};
-
-struct genl_multicast_group ovs_dp_vport_multicast_group = {
- .name = OVS_VPORT_MCGROUP
+ .name = OVS_DATAPATH_FAMILY,
+ .version = OVS_DATAPATH_VERSION,
+ .maxattr = OVS_DP_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_datapath_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
+ .mcgrps = &ovs_dp_datapath_multicast_group,
+ .n_mcgrps = 1,
};
-/* Called with RTNL lock or RCU read lock. */
+/* Called with ovs_mutex or RCU read lock. */
static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
- u32 pid, u32 seq, u32 flags, u8 cmd)
+ u32 portid, u32 seq, u32 flags, u8 cmd)
{
struct ovs_header *ovs_header;
struct ovs_vport_stats vport_stats;
int err;
- ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
ovs_header->dp_ifindex = get_dpifindex(vport->dp);
- NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
- NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type);
- NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport));
- NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid);
+ if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
+ nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
+ nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) ||
+ nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid))
+ goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
- NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
- &vport_stats);
+ if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
+ &vport_stats))
+ goto nla_put_failure;
err = ovs_vport_get_options(vport, skb);
if (err == -EMSGSIZE)
@@ -1491,8 +1654,13 @@ error:
return err;
}
-/* Called with RTNL lock or RCU read lock. */
-struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
+static struct sk_buff *ovs_vport_cmd_alloc_info(void)
+{
+ return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+}
+
+/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
u32 seq, u8 cmd)
{
struct sk_buff *skb;
@@ -1502,23 +1670,22 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
if (!skb)
return ERR_PTR(-ENOMEM);
- retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd);
- if (retval < 0) {
- kfree_skb(skb);
- return ERR_PTR(retval);
- }
+ retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
+ BUG_ON(retval < 0);
+
return skb;
}
-/* Called with RTNL lock or RCU read lock. */
-static struct vport *lookup_vport(struct ovs_header *ovs_header,
+/* Called with ovs_mutex or RCU read lock. */
+static struct vport *lookup_vport(struct net *net,
+ struct ovs_header *ovs_header,
struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
{
struct datapath *dp;
struct vport *vport;
if (a[OVS_VPORT_ATTR_NAME]) {
- vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
+ vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
if (!vport)
return ERR_PTR(-ENODEV);
if (ovs_header->dp_ifindex &&
@@ -1531,13 +1698,13 @@ static struct vport *lookup_vport(struct ovs_header *ovs_header,
if (port_no >= DP_MAX_PORTS)
return ERR_PTR(-EFBIG);
- dp = get_dp(ovs_header->dp_ifindex);
+ dp = get_dp(net, ovs_header->dp_ifindex);
if (!dp)
return ERR_PTR(-ENODEV);
- vport = rcu_dereference_rtnl(dp->ports[port_no]);
+ vport = ovs_vport_ovsl_rcu(dp, port_no);
if (!vport)
- return ERR_PTR(-ENOENT);
+ return ERR_PTR(-ENODEV);
return vport;
} else
return ERR_PTR(-EINVAL);
@@ -1554,35 +1721,37 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
u32 port_no;
int err;
- err = -EINVAL;
if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
!a[OVS_VPORT_ATTR_UPCALL_PID])
- goto exit;
+ return -EINVAL;
- rtnl_lock();
- dp = get_dp(ovs_header->dp_ifindex);
- err = -ENODEV;
- if (!dp)
- goto exit_unlock;
+ port_no = a[OVS_VPORT_ATTR_PORT_NO]
+ ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
+ if (port_no >= DP_MAX_PORTS)
+ return -EFBIG;
- if (a[OVS_VPORT_ATTR_PORT_NO]) {
- port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
- err = -EFBIG;
- if (port_no >= DP_MAX_PORTS)
- goto exit_unlock;
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ err = -ENODEV;
+ if (!dp)
+ goto exit_unlock_free;
- vport = rtnl_dereference(dp->ports[port_no]);
+ if (port_no) {
+ vport = ovs_vport_ovsl(dp, port_no);
err = -EBUSY;
if (vport)
- goto exit_unlock;
+ goto exit_unlock_free;
} else {
for (port_no = 1; ; port_no++) {
if (port_no >= DP_MAX_PORTS) {
err = -EFBIG;
- goto exit_unlock;
+ goto exit_unlock_free;
}
- vport = rtnl_dereference(dp->ports[port_no]);
+ vport = ovs_vport_ovsl(dp, port_no);
if (!vport)
break;
}
@@ -1593,26 +1762,24 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
parms.options = a[OVS_VPORT_ATTR_OPTIONS];
parms.dp = dp;
parms.port_no = port_no;
- parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
+ parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
vport = new_vport(&parms);
err = PTR_ERR(vport);
if (IS_ERR(vport))
- goto exit_unlock;
+ goto exit_unlock_free;
- reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
- OVS_VPORT_CMD_NEW);
- if (IS_ERR(reply)) {
- err = PTR_ERR(reply);
- ovs_dp_detach_port(vport);
- goto exit_unlock;
- }
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+ BUG_ON(err < 0);
+ ovs_unlock();
+
+ ovs_notify(&dp_vport_genl_family, reply, info);
+ return 0;
-exit_unlock:
- rtnl_unlock();
-exit:
+exit_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
return err;
}
@@ -1623,36 +1790,42 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct vport *vport;
int err;
- rtnl_lock();
- vport = lookup_vport(info->userhdr, a);
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
- goto exit_unlock;
+ goto exit_unlock_free;
- err = 0;
if (a[OVS_VPORT_ATTR_TYPE] &&
- nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type)
+ nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
err = -EINVAL;
+ goto exit_unlock_free;
+ }
- if (!err && a[OVS_VPORT_ATTR_OPTIONS])
+ if (a[OVS_VPORT_ATTR_OPTIONS]) {
err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
- if (!err && a[OVS_VPORT_ATTR_UPCALL_PID])
- vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
-
- reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
- OVS_VPORT_CMD_NEW);
- if (IS_ERR(reply)) {
- err = PTR_ERR(reply);
- netlink_set_err(init_net.genl_sock, 0,
- ovs_dp_vport_multicast_group.id, err);
- return 0;
+ if (err)
+ goto exit_unlock_free;
}
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+ if (a[OVS_VPORT_ATTR_UPCALL_PID])
+ vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
-exit_unlock:
- rtnl_unlock();
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+ BUG_ON(err < 0);
+
+ ovs_unlock();
+ ovs_notify(&dp_vport_genl_family, reply, info);
+ return 0;
+
+exit_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
return err;
}
@@ -1663,30 +1836,33 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct vport *vport;
int err;
- rtnl_lock();
- vport = lookup_vport(info->userhdr, a);
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
- goto exit_unlock;
+ goto exit_unlock_free;
if (vport->port_no == OVSP_LOCAL) {
err = -EINVAL;
- goto exit_unlock;
+ goto exit_unlock_free;
}
- reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
- OVS_VPORT_CMD_DEL);
- err = PTR_ERR(reply);
- if (IS_ERR(reply))
- goto exit_unlock;
-
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_DEL);
+ BUG_ON(err < 0);
ovs_dp_detach_port(vport);
+ ovs_unlock();
- genl_notify(reply, genl_info_net(info), info->snd_pid,
- ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+ ovs_notify(&dp_vport_genl_family, reply, info);
+ return 0;
-exit_unlock:
- rtnl_unlock();
+exit_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
return err;
}
@@ -1698,24 +1874,25 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
struct vport *vport;
int err;
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
rcu_read_lock();
- vport = lookup_vport(ovs_header, a);
+ vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
- goto exit_unlock;
-
- reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
- OVS_VPORT_CMD_NEW);
- err = PTR_ERR(reply);
- if (IS_ERR(reply))
- goto exit_unlock;
-
+ goto exit_unlock_free;
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+ BUG_ON(err < 0);
rcu_read_unlock();
return genlmsg_reply(reply, info);
-exit_unlock:
+exit_unlock_free:
rcu_read_unlock();
+ kfree_skb(reply);
return err;
}
@@ -1723,55 +1900,49 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
struct datapath *dp;
- u32 port_no;
- int retval;
-
- dp = get_dp(ovs_header->dp_ifindex);
- if (!dp)
- return -ENODEV;
+ int bucket = cb->args[0], skip = cb->args[1];
+ int i, j = 0;
rcu_read_lock();
- for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) {
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
- vport = rcu_dereference(dp->ports[port_no]);
- if (!vport)
- continue;
-
- if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- OVS_VPORT_CMD_NEW) < 0)
- break;
+ j = 0;
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+ if (j >= skip &&
+ ovs_vport_cmd_fill_info(vport, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ OVS_VPORT_CMD_NEW) < 0)
+ goto out;
+
+ j++;
+ }
+ skip = 0;
}
+out:
rcu_read_unlock();
- cb->args[0] = port_no;
- retval = skb->len;
+ cb->args[0] = i;
+ cb->args[1] = j;
- return retval;
+ return skb->len;
}
-static void rehash_flow_table(struct work_struct *work)
-{
- struct datapath *dp;
-
- genl_lock();
-
- list_for_each_entry(dp, &dps, list_node) {
- struct flow_table *old_table = genl_dereference(dp->table);
- struct flow_table *new_table;
-
- new_table = ovs_flow_tbl_rehash(old_table);
- if (!IS_ERR(new_table)) {
- rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_deferred_destroy(old_table);
- }
- }
-
- genl_unlock();
-
- schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
-}
+static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
+ [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+ [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
+ [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
+};
static struct genl_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
@@ -1797,26 +1968,25 @@ static struct genl_ops dp_vport_genl_ops[] = {
},
};
-struct genl_family_and_ops {
- struct genl_family *family;
- struct genl_ops *ops;
- int n_ops;
- struct genl_multicast_group *group;
+struct genl_family dp_vport_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_VPORT_FAMILY,
+ .version = OVS_VPORT_VERSION,
+ .maxattr = OVS_VPORT_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_vport_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .mcgrps = &ovs_dp_vport_multicast_group,
+ .n_mcgrps = 1,
};
-static const struct genl_family_and_ops dp_genl_families[] = {
- { &dp_datapath_genl_family,
- dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
- &ovs_dp_datapath_multicast_group },
- { &dp_vport_genl_family,
- dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
- &ovs_dp_vport_multicast_group },
- { &dp_flow_genl_family,
- dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
- &ovs_dp_flow_multicast_group },
- { &dp_packet_genl_family,
- dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
- NULL },
+static struct genl_family * const dp_genl_families[] = {
+ &dp_datapath_genl_family,
+ &dp_vport_genl_family,
+ &dp_flow_genl_family,
+ &dp_packet_genl_family,
};
static void dp_unregister_genl(int n_families)
@@ -1824,45 +1994,62 @@ static void dp_unregister_genl(int n_families)
int i;
for (i = 0; i < n_families; i++)
- genl_unregister_family(dp_genl_families[i].family);
+ genl_unregister_family(dp_genl_families[i]);
}
static int dp_register_genl(void)
{
- int n_registered;
int err;
int i;
- n_registered = 0;
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
- const struct genl_family_and_ops *f = &dp_genl_families[i];
- err = genl_register_family_with_ops(f->family, f->ops,
- f->n_ops);
+ err = genl_register_family(dp_genl_families[i]);
if (err)
goto error;
- n_registered++;
-
- if (f->group) {
- err = genl_register_mc_group(f->family, f->group);
- if (err)
- goto error;
- }
}
return 0;
error:
- dp_unregister_genl(n_registered);
+ dp_unregister_genl(i);
return err;
}
+static int __net_init ovs_init_net(struct net *net)
+{
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ INIT_LIST_HEAD(&ovs_net->dps);
+ INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
+ return 0;
+}
+
+static void __net_exit ovs_exit_net(struct net *net)
+{
+ struct datapath *dp, *dp_next;
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ ovs_lock();
+ list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
+ __dp_destroy(dp);
+ ovs_unlock();
+
+ cancel_work_sync(&ovs_net->dp_notify_work);
+}
+
+static struct pernet_operations ovs_net_ops = {
+ .init = ovs_init_net,
+ .exit = ovs_exit_net,
+ .id = &ovs_net_id,
+ .size = sizeof(struct ovs_net),
+};
+
static int __init dp_init(void)
{
- struct sk_buff *dummy_skb;
int err;
- BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath\n");
@@ -1874,20 +2061,24 @@ static int __init dp_init(void)
if (err)
goto error_flow_exit;
- err = register_netdevice_notifier(&ovs_dp_device_notifier);
+ err = register_pernet_device(&ovs_net_ops);
if (err)
goto error_vport_exit;
+ err = register_netdevice_notifier(&ovs_dp_device_notifier);
+ if (err)
+ goto error_netns_exit;
+
err = dp_register_genl();
if (err < 0)
goto error_unreg_notifier;
- schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
-
return 0;
error_unreg_notifier:
unregister_netdevice_notifier(&ovs_dp_device_notifier);
+error_netns_exit:
+ unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
ovs_vport_exit();
error_flow_exit:
@@ -1898,10 +2089,10 @@ error:
static void dp_cleanup(void)
{
- cancel_delayed_work_sync(&rehash_flow_wq);
- rcu_barrier();
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
unregister_netdevice_notifier(&ovs_dp_device_notifier);
+ unregister_pernet_device(&ovs_net_ops);
+ rcu_barrier();
ovs_vport_exit();
ovs_flow_exit();
}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index c73370cc1f0..7ede507500d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -27,10 +27,12 @@
#include <linux/u64_stats_sync.h>
#include "flow.h"
+#include "flow_table.h"
+#include "vport.h"
-struct vport;
+#define DP_MAX_PORTS USHRT_MAX
+#define DP_VPORT_HASH_BUCKETS 1024
-#define DP_MAX_PORTS 1024
#define SAMPLE_ACTION_DEPTH 3
/**
@@ -44,25 +46,27 @@ struct vport;
* @n_lost: Number of received packets that had no matching flow in the flow
* table that could not be sent to userspace (normally due to an overflow in
* one of the datapath's queues).
+ * @n_mask_hit: Number of masks looked up for flow match.
+ * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked
+ * up per packet.
*/
struct dp_stats_percpu {
u64 n_hit;
u64 n_missed;
u64 n_lost;
- struct u64_stats_sync sync;
+ u64 n_mask_hit;
+ struct u64_stats_sync syncp;
};
/**
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
* @list_node: Element in global 'dps' list.
- * @n_flows: Number of flows currently in flow table.
- * @table: Current flow table. Protected by genl_lock and RCU.
- * @ports: Map from port number to &struct vport. %OVSP_LOCAL port
- * always exists, other ports may be %NULL. Protected by RTNL and RCU.
- * @port_list: List of all ports in @ports in arbitrary order. RTNL required
- * to iterate or modify.
+ * @table: flow table.
+ * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by
+ * ovs_mutex and RCU.
* @stats_percpu: Per-CPU datapath statistics.
+ * @net: Reference to net namespace.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
@@ -72,22 +76,33 @@ struct datapath {
struct list_head list_node;
/* Flow table. */
- struct flow_table __rcu *table;
+ struct flow_table table;
/* Switch ports. */
- struct vport __rcu *ports[DP_MAX_PORTS];
- struct list_head port_list;
+ struct hlist_head *ports;
/* Stats. */
struct dp_stats_percpu __percpu *stats_percpu;
+
+#ifdef CONFIG_NET_NS
+ /* Network namespace ref. */
+ struct net *net;
+#endif
+
+ u32 user_features;
};
/**
* struct ovs_skb_cb - OVS data in skb CB
* @flow: The flow associated with this packet. May be %NULL if no flow.
+ * @pkt_key: The flow information extracted from the packet. Must be nonnull.
+ * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
+ * packet is not being tunneled.
*/
struct ovs_skb_cb {
struct sw_flow *flow;
+ struct sw_flow_key *pkt_key;
+ struct ovs_key_ipv4_tunnel *tun_key;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -95,7 +110,7 @@ struct ovs_skb_cb {
* struct dp_upcall - metadata to include with a packet to send to userspace
* @cmd: One of %OVS_PACKET_CMD_*.
* @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull.
- * @userdata: If nonnull, its u64 value is extracted and passed to userspace as
+ * @userdata: If nonnull, its variable-length value is passed to userspace as
* %OVS_PACKET_ATTR_USERDATA.
* @pid: Netlink PID to which packet should be sent. If @pid is 0 then no
* packet is sent and the packet is accounted in the datapath's @n_lost
@@ -105,20 +120,83 @@ struct dp_upcall_info {
u8 cmd;
const struct sw_flow_key *key;
const struct nlattr *userdata;
- u32 pid;
+ u32 portid;
+};
+
+/**
+ * struct ovs_net - Per net-namespace data for ovs.
+ * @dps: List of datapaths to enable dumping them all out.
+ * Protected by genl_mutex.
+ */
+struct ovs_net {
+ struct list_head dps;
+ struct work_struct dp_notify_work;
+ struct vport_net vport_net;
};
+extern int ovs_net_id;
+void ovs_lock(void);
+void ovs_unlock(void);
+
+#ifdef CONFIG_LOCKDEP
+int lockdep_ovsl_is_held(void);
+#else
+#define lockdep_ovsl_is_held() 1
+#endif
+
+#define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held()))
+#define ovsl_dereference(p) \
+ rcu_dereference_protected(p, lockdep_ovsl_is_held())
+#define rcu_dereference_ovsl(p) \
+ rcu_dereference_check(p, lockdep_ovsl_is_held())
+
+static inline struct net *ovs_dp_get_net(struct datapath *dp)
+{
+ return read_pnet(&dp->net);
+}
+
+static inline void ovs_dp_set_net(struct datapath *dp, struct net *net)
+{
+ write_pnet(&dp->net, net);
+}
+
+struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no);
+
+static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return ovs_lookup_vport(dp, port_no);
+}
+
+static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+ return ovs_lookup_vport(dp, port_no);
+}
+
+static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no)
+{
+ ASSERT_OVSL();
+ return ovs_lookup_vport(dp, port_no);
+}
+
extern struct notifier_block ovs_dp_device_notifier;
-extern struct genl_multicast_group ovs_dp_vport_multicast_group;
+extern struct genl_family dp_vport_genl_family;
void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
void ovs_dp_detach_port(struct vport *);
int ovs_dp_upcall(struct datapath *, struct sk_buff *,
const struct dp_upcall_info *);
-const char *ovs_dp_name(const struct datapath *dp);
struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
u8 cmd);
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
+void ovs_dp_notify_wq(struct work_struct *work);
+
+#define OVS_NLERR(fmt, ...) \
+do { \
+ if (net_ratelimit()) \
+ pr_info("netlink: " fmt, ##__VA_ARGS__); \
+} while (0)
#endif /* datapath.h */
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 46736518c45..2c631fe76be 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -18,44 +18,80 @@
#include <linux/netdevice.h>
#include <net/genetlink.h>
+#include <net/netns/generic.h>
#include "datapath.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
+static void dp_detach_port_notify(struct vport *vport)
+{
+ struct sk_buff *notify;
+ struct datapath *dp;
+
+ dp = vport->dp;
+ notify = ovs_vport_cmd_build_info(vport, 0, 0,
+ OVS_VPORT_CMD_DEL);
+ ovs_dp_detach_port(vport);
+ if (IS_ERR(notify)) {
+ genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0,
+ 0, PTR_ERR(notify));
+ return;
+ }
+
+ genlmsg_multicast_netns(&dp_vport_genl_family,
+ ovs_dp_get_net(dp), notify, 0,
+ 0, GFP_KERNEL);
+}
+
+void ovs_dp_notify_wq(struct work_struct *work)
+{
+ struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work);
+ struct datapath *dp;
+
+ ovs_lock();
+ list_for_each_entry(dp, &ovs_net->dps, list_node) {
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ struct vport *vport;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
+ struct netdev_vport *netdev_vport;
+
+ if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
+ continue;
+
+ netdev_vport = netdev_vport_priv(vport);
+ if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
+ dp_detach_port_notify(vport);
+ }
+ }
+ }
+ ovs_unlock();
+}
+
static int dp_device_event(struct notifier_block *unused, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
- struct vport *vport;
+ struct ovs_net *ovs_net;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct vport *vport = NULL;
- if (ovs_is_internal_dev(dev))
- vport = ovs_internal_dev_get_vport(dev);
- else
+ if (!ovs_is_internal_dev(dev))
vport = ovs_netdev_get_vport(dev);
if (!vport)
return NOTIFY_DONE;
- switch (event) {
- case NETDEV_UNREGISTER:
- if (!ovs_is_internal_dev(dev)) {
- struct sk_buff *notify;
-
- notify = ovs_vport_cmd_build_info(vport, 0, 0,
- OVS_VPORT_CMD_DEL);
- ovs_dp_detach_port(vport);
- if (IS_ERR(notify)) {
- netlink_set_err(init_net.genl_sock, 0,
- ovs_dp_vport_multicast_group.id,
- PTR_ERR(notify));
- break;
- }
+ if (event == NETDEV_UNREGISTER) {
+ /* upper_dev_unlink and decrement promisc immediately */
+ ovs_netdev_detach_dev(vport);
- genlmsg_multicast(notify, 0, ovs_dp_vport_multicast_group.id,
- GFP_KERNEL);
- }
- break;
+ /* schedule vport destroy, dev_put and genl notification */
+ ovs_net = net_generic(dev_net(dev), ovs_net_id);
+ queue_work(system_wq, &ovs_net->dp_notify_work);
}
return NOTIFY_DONE;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 1252c3081ef..d07ab538fc9 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -34,16 +34,141 @@
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/smp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
-static struct kmem_cache *flow_cache;
+u64 ovs_flow_used_time(unsigned long flow_jiffies)
+{
+ struct timespec cur_ts;
+ u64 cur_ms, idle_ms;
+
+ ktime_get_ts(&cur_ts);
+ idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
+ cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+ cur_ts.tv_nsec / NSEC_PER_MSEC;
+
+ return cur_ms - idle_ms;
+}
+
+#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
+
+void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
+ struct sk_buff *skb)
+{
+ struct flow_stats *stats;
+ int node = numa_node_id();
+
+ stats = rcu_dereference(flow->stats[node]);
+
+ /* Check if already have node-specific stats. */
+ if (likely(stats)) {
+ spin_lock(&stats->lock);
+ /* Mark if we write on the pre-allocated stats. */
+ if (node == 0 && unlikely(flow->stats_last_writer != node))
+ flow->stats_last_writer = node;
+ } else {
+ stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
+ spin_lock(&stats->lock);
+
+ /* If the current NUMA-node is the only writer on the
+ * pre-allocated stats keep using them.
+ */
+ if (unlikely(flow->stats_last_writer != node)) {
+ /* A previous locker may have already allocated the
+ * stats, so we need to check again. If node-specific
+ * stats were already allocated, we update the pre-
+ * allocated stats as we have already locked them.
+ */
+ if (likely(flow->stats_last_writer != NUMA_NO_NODE)
+ && likely(!rcu_dereference(flow->stats[node]))) {
+ /* Try to allocate node-specific stats. */
+ struct flow_stats *new_stats;
+
+ new_stats =
+ kmem_cache_alloc_node(flow_stats_cache,
+ GFP_THISNODE |
+ __GFP_NOMEMALLOC,
+ node);
+ if (likely(new_stats)) {
+ new_stats->used = jiffies;
+ new_stats->packet_count = 1;
+ new_stats->byte_count = skb->len;
+ new_stats->tcp_flags = tcp_flags;
+ spin_lock_init(&new_stats->lock);
+
+ rcu_assign_pointer(flow->stats[node],
+ new_stats);
+ goto unlock;
+ }
+ }
+ flow->stats_last_writer = node;
+ }
+ }
+
+ stats->used = jiffies;
+ stats->packet_count++;
+ stats->byte_count += skb->len;
+ stats->tcp_flags |= tcp_flags;
+unlock:
+ spin_unlock(&stats->lock);
+}
+
+/* Must be called with rcu_read_lock or ovs_mutex. */
+void ovs_flow_stats_get(const struct sw_flow *flow,
+ struct ovs_flow_stats *ovs_stats,
+ unsigned long *used, __be16 *tcp_flags)
+{
+ int node;
+
+ *used = 0;
+ *tcp_flags = 0;
+ memset(ovs_stats, 0, sizeof(*ovs_stats));
+
+ for_each_node(node) {
+ struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
+
+ if (stats) {
+ /* Local CPU may write on non-local stats, so we must
+ * block bottom-halves here.
+ */
+ spin_lock_bh(&stats->lock);
+ if (!*used || time_after(stats->used, *used))
+ *used = stats->used;
+ *tcp_flags |= stats->tcp_flags;
+ ovs_stats->n_packets += stats->packet_count;
+ ovs_stats->n_bytes += stats->byte_count;
+ spin_unlock_bh(&stats->lock);
+ }
+ }
+}
+
+/* Called with ovs_mutex. */
+void ovs_flow_stats_clear(struct sw_flow *flow)
+{
+ int node;
+
+ for_each_node(node) {
+ struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
+
+ if (stats) {
+ spin_lock_bh(&stats->lock);
+ stats->used = 0;
+ stats->packet_count = 0;
+ stats->byte_count = 0;
+ stats->tcp_flags = 0;
+ spin_unlock_bh(&stats->lock);
+ }
+ }
+}
static int check_header(struct sk_buff *skb, int len)
{
@@ -101,31 +226,19 @@ static bool udphdr_ok(struct sk_buff *skb)
sizeof(struct udphdr));
}
-static bool icmphdr_ok(struct sk_buff *skb)
+static bool sctphdr_ok(struct sk_buff *skb)
{
return pskb_may_pull(skb, skb_transport_offset(skb) +
- sizeof(struct icmphdr));
+ sizeof(struct sctphdr));
}
-u64 ovs_flow_used_time(unsigned long flow_jiffies)
+static bool icmphdr_ok(struct sk_buff *skb)
{
- struct timespec cur_ts;
- u64 cur_ms, idle_ms;
-
- ktime_get_ts(&cur_ts);
- idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
- cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
- cur_ts.tv_nsec / NSEC_PER_MSEC;
-
- return cur_ms - idle_ms;
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct icmphdr));
}
-#define SW_FLOW_KEY_OFFSET(field) \
- (offsetof(struct sw_flow_key, field) + \
- FIELD_SIZEOF(struct sw_flow_key, field))
-
-static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
- int *key_lenp)
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
unsigned int nh_ofs = skb_network_offset(skb);
unsigned int nh_len;
@@ -135,8 +248,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
__be16 frag_off;
int err;
- *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
-
err = check_header(skb, nh_ofs + sizeof(*nh));
if (unlikely(err))
return err;
@@ -175,271 +286,6 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
sizeof(struct icmp6hdr));
}
-#define TCP_FLAGS_OFFSET 13
-#define TCP_FLAG_MASK 0x3f
-
-void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
-{
- u8 tcp_flags = 0;
-
- if (flow->key.eth.type == htons(ETH_P_IP) &&
- flow->key.ip.proto == IPPROTO_TCP) {
- u8 *tcp = (u8 *)tcp_hdr(skb);
- tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
- }
-
- spin_lock(&flow->lock);
- flow->used = jiffies;
- flow->packet_count++;
- flow->byte_count += skb->len;
- flow->tcp_flags |= tcp_flags;
- spin_unlock(&flow->lock);
-}
-
-struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
-{
- int actions_len = nla_len(actions);
- struct sw_flow_actions *sfa;
-
- /* At least DP_MAX_PORTS actions are required to be able to flood a
- * packet to every port. Factor of 2 allows for setting VLAN tags,
- * etc. */
- if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4))
- return ERR_PTR(-EINVAL);
-
- sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
- if (!sfa)
- return ERR_PTR(-ENOMEM);
-
- sfa->actions_len = actions_len;
- memcpy(sfa->actions, nla_data(actions), actions_len);
- return sfa;
-}
-
-struct sw_flow *ovs_flow_alloc(void)
-{
- struct sw_flow *flow;
-
- flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
- if (!flow)
- return ERR_PTR(-ENOMEM);
-
- spin_lock_init(&flow->lock);
- flow->sf_acts = NULL;
-
- return flow;
-}
-
-static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
-{
- hash = jhash_1word(hash, table->hash_seed);
- return flex_array_get(table->buckets,
- (hash & (table->n_buckets - 1)));
-}
-
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
- struct flex_array *buckets;
- int i, err;
-
- buckets = flex_array_alloc(sizeof(struct hlist_head *),
- n_buckets, GFP_KERNEL);
- if (!buckets)
- return NULL;
-
- err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
- if (err) {
- flex_array_free(buckets);
- return NULL;
- }
-
- for (i = 0; i < n_buckets; i++)
- INIT_HLIST_HEAD((struct hlist_head *)
- flex_array_get(buckets, i));
-
- return buckets;
-}
-
-static void free_buckets(struct flex_array *buckets)
-{
- flex_array_free(buckets);
-}
-
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
-{
- struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
-
- if (!table)
- return NULL;
-
- table->buckets = alloc_buckets(new_size);
-
- if (!table->buckets) {
- kfree(table);
- return NULL;
- }
- table->n_buckets = new_size;
- table->count = 0;
- table->node_ver = 0;
- table->keep_flows = false;
- get_random_bytes(&table->hash_seed, sizeof(u32));
-
- return table;
-}
-
-void ovs_flow_tbl_destroy(struct flow_table *table)
-{
- int i;
-
- if (!table)
- return;
-
- if (table->keep_flows)
- goto skip_flows;
-
- for (i = 0; i < table->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head = flex_array_get(table->buckets, i);
- struct hlist_node *node, *n;
- int ver = table->node_ver;
-
- hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
- hlist_del_rcu(&flow->hash_node[ver]);
- ovs_flow_free(flow);
- }
- }
-
-skip_flows:
- free_buckets(table->buckets);
- kfree(table);
-}
-
-static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
-{
- struct flow_table *table = container_of(rcu, struct flow_table, rcu);
-
- ovs_flow_tbl_destroy(table);
-}
-
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
-{
- if (!table)
- return;
-
- call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
-}
-
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
-{
- struct sw_flow *flow;
- struct hlist_head *head;
- struct hlist_node *n;
- int ver;
- int i;
-
- ver = table->node_ver;
- while (*bucket < table->n_buckets) {
- i = 0;
- head = flex_array_get(table->buckets, *bucket);
- hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) {
- if (i < *last) {
- i++;
- continue;
- }
- *last = i + 1;
- return flow;
- }
- (*bucket)++;
- *last = 0;
- }
-
- return NULL;
-}
-
-static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
-{
- int old_ver;
- int i;
-
- old_ver = old->node_ver;
- new->node_ver = !old_ver;
-
- /* Insert in new table. */
- for (i = 0; i < old->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head;
- struct hlist_node *n;
-
- head = flex_array_get(old->buckets, i);
-
- hlist_for_each_entry(flow, n, head, hash_node[old_ver])
- ovs_flow_tbl_insert(new, flow);
- }
- old->keep_flows = true;
-}
-
-static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
-{
- struct flow_table *new_table;
-
- new_table = ovs_flow_tbl_alloc(n_buckets);
- if (!new_table)
- return ERR_PTR(-ENOMEM);
-
- flow_table_copy_flows(table, new_table);
-
- return new_table;
-}
-
-struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
-{
- return __flow_tbl_rehash(table, table->n_buckets);
-}
-
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
-{
- return __flow_tbl_rehash(table, table->n_buckets * 2);
-}
-
-void ovs_flow_free(struct sw_flow *flow)
-{
- if (unlikely(!flow))
- return;
-
- kfree((struct sf_flow_acts __force *)flow->sf_acts);
- kmem_cache_free(flow_cache, flow);
-}
-
-/* RCU callback used by ovs_flow_deferred_free. */
-static void rcu_free_flow_callback(struct rcu_head *rcu)
-{
- struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
-
- ovs_flow_free(flow);
-}
-
-/* Schedules 'flow' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free(struct sw_flow *flow)
-{
- call_rcu(&flow->rcu, rcu_free_flow_callback);
-}
-
-/* RCU callback used by ovs_flow_deferred_free_acts. */
-static void rcu_free_acts_callback(struct rcu_head *rcu)
-{
- struct sw_flow_actions *sf_acts = container_of(rcu,
- struct sw_flow_actions, rcu);
- kfree(sf_acts);
-}
-
-/* Schedules 'sf_acts' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
-{
- call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
-}
-
static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
struct qtag_prefix {
@@ -477,7 +323,7 @@ static __be16 parse_ethertype(struct sk_buff *skb)
proto = *(__be16 *) skb->data;
__skb_pull(skb, sizeof(__be16));
- if (ntohs(proto) >= 1536)
+ if (ntohs(proto) >= ETH_P_802_3_MIN)
return proto;
if (skb->len < sizeof(struct llc_snap_hdr))
@@ -493,22 +339,23 @@ static __be16 parse_ethertype(struct sk_buff *skb)
return htons(ETH_P_802_2);
__skb_pull(skb, sizeof(struct llc_snap_hdr));
- return llc->ethertype;
+
+ if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
+ return llc->ethertype;
+
+ return htons(ETH_P_802_2);
}
static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
- int *key_lenp, int nh_len)
+ int nh_len)
{
struct icmp6hdr *icmp = icmp6_hdr(skb);
- int error = 0;
- int key_len;
/* The ICMPv6 type and code fields use the 16-bit transport port
* fields, so we need to store them in 16-bit network byte order.
*/
- key->ipv6.tp.src = htons(icmp->icmp6_type);
- key->ipv6.tp.dst = htons(icmp->icmp6_code);
- key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ key->tp.src = htons(icmp->icmp6_type);
+ key->tp.dst = htons(icmp->icmp6_code);
if (icmp->icmp6_code == 0 &&
(icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
@@ -517,21 +364,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
struct nd_msg *nd;
int offset;
- key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-
/* In order to process neighbor discovery options, we need the
* entire packet.
*/
if (unlikely(icmp_len < sizeof(*nd)))
- goto out;
- if (unlikely(skb_linearize(skb))) {
- error = -ENOMEM;
- goto out;
- }
+ return 0;
+
+ if (unlikely(skb_linearize(skb)))
+ return -ENOMEM;
nd = (struct nd_msg *)skb_transport_header(skb);
key->ipv6.nd.target = nd->target;
- key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
icmp_len -= sizeof(*nd);
offset = 0;
@@ -541,7 +384,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
int opt_len = nd_opt->nd_opt_len * 8;
if (unlikely(!opt_len || opt_len > icmp_len))
- goto invalid;
+ return 0;
/* Store the link layer address if the appropriate
* option is provided. It is considered an error if
@@ -551,14 +394,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
&& opt_len == 8) {
if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
goto invalid;
- memcpy(key->ipv6.nd.sll,
- &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
+ ether_addr_copy(key->ipv6.nd.sll,
+ &nd->opt[offset+sizeof(*nd_opt)]);
} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
&& opt_len == 8) {
if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
goto invalid;
- memcpy(key->ipv6.nd.tll,
- &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
+ ether_addr_copy(key->ipv6.nd.tll,
+ &nd->opt[offset+sizeof(*nd_opt)]);
}
icmp_len -= opt_len;
@@ -566,16 +409,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
}
}
- goto out;
+ return 0;
invalid:
memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
-out:
- *key_lenp = key_len;
- return error;
+ return 0;
}
/**
@@ -584,7 +425,6 @@ out:
* Ethernet header
* @in_port: port number on which @skb was received.
* @key: output flow key
- * @key_lenp: length of output flow key
*
* The caller must ensure that skb->len >= ETH_HLEN.
*
@@ -597,22 +437,23 @@ out:
* - skb->network_header: just past the Ethernet header, or just past the
* VLAN header, to the first byte of the Ethernet payload.
*
- * - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6
+ * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
* on output, then just past the IP header, if one is present and
* of a correct length, otherwise the same as skb->network_header.
- * For other key->dl_type values it is left untouched.
+ * For other key->eth.type values it is left untouched.
*/
-int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
- int *key_lenp)
+int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
{
- int error = 0;
- int key_len = SW_FLOW_KEY_OFFSET(eth);
+ int error;
struct ethhdr *eth;
memset(key, 0, sizeof(*key));
key->phy.priority = skb->priority;
+ if (OVS_CB(skb)->tun_key)
+ memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
key->phy.in_port = in_port;
+ key->phy.skb_mark = skb->mark;
skb_reset_mac_header(skb);
@@ -620,10 +461,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
* header in the linear data area.
*/
eth = eth_hdr(skb);
- memcpy(key->eth.src, eth->h_source, ETH_ALEN);
- memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
+ ether_addr_copy(key->eth.src, eth->h_source);
+ ether_addr_copy(key->eth.dst, eth->h_dest);
__skb_pull(skb, 2 * ETH_ALEN);
+ /* We are going to push all headers that we pull, so no need to
+ * update skb->csum here.
+ */
if (vlan_tx_tag_present(skb))
key->eth.tci = htons(skb->vlan_tci);
@@ -643,15 +487,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
struct iphdr *nh;
__be16 offset;
- key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
-
error = check_iphdr(skb);
if (unlikely(error)) {
if (error == -EINVAL) {
skb->transport_header = skb->network_header;
error = 0;
}
- goto out;
+ return error;
}
nh = ip_hdr(skb);
@@ -665,7 +507,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
offset = nh->frag_off & htons(IP_OFFSET);
if (offset) {
key->ip.frag = OVS_FRAG_TYPE_LATER;
- goto out;
+ return 0;
}
if (nh->frag_off & htons(IP_MF) ||
skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
@@ -673,32 +515,37 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
/* Transport layer. */
if (key->ip.proto == IPPROTO_TCP) {
- key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
if (tcphdr_ok(skb)) {
struct tcphdr *tcp = tcp_hdr(skb);
- key->ipv4.tp.src = tcp->source;
- key->ipv4.tp.dst = tcp->dest;
+ key->tp.src = tcp->source;
+ key->tp.dst = tcp->dest;
+ key->tp.flags = TCP_FLAGS_BE16(tcp);
}
} else if (key->ip.proto == IPPROTO_UDP) {
- key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
if (udphdr_ok(skb)) {
struct udphdr *udp = udp_hdr(skb);
- key->ipv4.tp.src = udp->source;
- key->ipv4.tp.dst = udp->dest;
+ key->tp.src = udp->source;
+ key->tp.dst = udp->dest;
+ }
+ } else if (key->ip.proto == IPPROTO_SCTP) {
+ if (sctphdr_ok(skb)) {
+ struct sctphdr *sctp = sctp_hdr(skb);
+ key->tp.src = sctp->source;
+ key->tp.dst = sctp->dest;
}
} else if (key->ip.proto == IPPROTO_ICMP) {
- key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
if (icmphdr_ok(skb)) {
struct icmphdr *icmp = icmp_hdr(skb);
/* The ICMP type and code fields use the 16-bit
* transport port fields, so we need to store
* them in 16-bit network byte order. */
- key->ipv4.tp.src = htons(icmp->type);
- key->ipv4.tp.dst = htons(icmp->code);
+ key->tp.src = htons(icmp->type);
+ key->tp.dst = htons(icmp->code);
}
}
- } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
+ } else if ((key->eth.type == htons(ETH_P_ARP) ||
+ key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
struct arp_eth_header *arp;
arp = (struct arp_eth_header *)skb_network_header(skb);
@@ -711,635 +558,58 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
/* We only match on the lower 8 bits of the opcode. */
if (ntohs(arp->ar_op) <= 0xff)
key->ip.proto = ntohs(arp->ar_op);
-
- if (key->ip.proto == ARPOP_REQUEST
- || key->ip.proto == ARPOP_REPLY) {
- memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
- memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
- memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
- memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
- key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
- }
+ memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
+ memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
+ ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
+ ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
}
} else if (key->eth.type == htons(ETH_P_IPV6)) {
int nh_len; /* IPv6 Header + Extensions */
- nh_len = parse_ipv6hdr(skb, key, &key_len);
+ nh_len = parse_ipv6hdr(skb, key);
if (unlikely(nh_len < 0)) {
- if (nh_len == -EINVAL)
+ if (nh_len == -EINVAL) {
skb->transport_header = skb->network_header;
- else
+ error = 0;
+ } else {
error = nh_len;
- goto out;
+ }
+ return error;
}
if (key->ip.frag == OVS_FRAG_TYPE_LATER)
- goto out;
+ return 0;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
key->ip.frag = OVS_FRAG_TYPE_FIRST;
/* Transport layer. */
if (key->ip.proto == NEXTHDR_TCP) {
- key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
if (tcphdr_ok(skb)) {
struct tcphdr *tcp = tcp_hdr(skb);
- key->ipv6.tp.src = tcp->source;
- key->ipv6.tp.dst = tcp->dest;
+ key->tp.src = tcp->source;
+ key->tp.dst = tcp->dest;
+ key->tp.flags = TCP_FLAGS_BE16(tcp);
}
} else if (key->ip.proto == NEXTHDR_UDP) {
- key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
if (udphdr_ok(skb)) {
struct udphdr *udp = udp_hdr(skb);
- key->ipv6.tp.src = udp->source;
- key->ipv6.tp.dst = udp->dest;
+ key->tp.src = udp->source;
+ key->tp.dst = udp->dest;
+ }
+ } else if (key->ip.proto == NEXTHDR_SCTP) {
+ if (sctphdr_ok(skb)) {
+ struct sctphdr *sctp = sctp_hdr(skb);
+ key->tp.src = sctp->source;
+ key->tp.dst = sctp->dest;
}
} else if (key->ip.proto == NEXTHDR_ICMP) {
- key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
if (icmp6hdr_ok(skb)) {
- error = parse_icmpv6(skb, key, &key_len, nh_len);
- if (error < 0)
- goto out;
- }
- }
- }
-
-out:
- *key_lenp = key_len;
- return error;
-}
-
-u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
-{
- return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
-}
-
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
- struct sw_flow_key *key, int key_len)
-{
- struct sw_flow *flow;
- struct hlist_node *n;
- struct hlist_head *head;
- u32 hash;
-
- hash = ovs_flow_hash(key, key_len);
-
- head = find_bucket(table, hash);
- hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
-
- if (flow->hash == hash &&
- !memcmp(&flow->key, key, key_len)) {
- return flow;
- }
- }
- return NULL;
-}
-
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
-{
- struct hlist_head *head;
-
- head = find_bucket(table, flow->hash);
- hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
- table->count++;
-}
-
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
-{
- hlist_del_rcu(&flow->hash_node[table->node_ver]);
- table->count--;
- BUG_ON(table->count < 0);
-}
-
-/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
-const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
- [OVS_KEY_ATTR_ENCAP] = -1,
- [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
- [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
- [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
- [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
- [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
- [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
- [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
- [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
- [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
- [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
- [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
- [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
- [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
-};
-
-static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
- const struct nlattr *a[], u32 *attrs)
-{
- const struct ovs_key_icmp *icmp_key;
- const struct ovs_key_tcp *tcp_key;
- const struct ovs_key_udp *udp_key;
-
- switch (swkey->ip.proto) {
- case IPPROTO_TCP:
- if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
- tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
- swkey->ipv4.tp.src = tcp_key->tcp_src;
- swkey->ipv4.tp.dst = tcp_key->tcp_dst;
- break;
-
- case IPPROTO_UDP:
- if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
- udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
- swkey->ipv4.tp.src = udp_key->udp_src;
- swkey->ipv4.tp.dst = udp_key->udp_dst;
- break;
-
- case IPPROTO_ICMP:
- if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
- icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
- swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
- swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
- break;
- }
-
- return 0;
-}
-
-static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
- const struct nlattr *a[], u32 *attrs)
-{
- const struct ovs_key_icmpv6 *icmpv6_key;
- const struct ovs_key_tcp *tcp_key;
- const struct ovs_key_udp *udp_key;
-
- switch (swkey->ip.proto) {
- case IPPROTO_TCP:
- if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
- tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
- swkey->ipv6.tp.src = tcp_key->tcp_src;
- swkey->ipv6.tp.dst = tcp_key->tcp_dst;
- break;
-
- case IPPROTO_UDP:
- if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
- udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
- swkey->ipv6.tp.src = udp_key->udp_src;
- swkey->ipv6.tp.dst = udp_key->udp_dst;
- break;
-
- case IPPROTO_ICMPV6:
- if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
- icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
- swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
- swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
-
- if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
- swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
- const struct ovs_key_nd *nd_key;
-
- if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_ND);
-
- *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
- nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
- memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
- sizeof(swkey->ipv6.nd.target));
- memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
- memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
- }
- break;
- }
-
- return 0;
-}
-
-static int parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u32 *attrsp)
-{
- const struct nlattr *nla;
- u32 attrs;
- int rem;
-
- attrs = 0;
- nla_for_each_nested(nla, attr, rem) {
- u16 type = nla_type(nla);
- int expected_len;
-
- if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type))
- return -EINVAL;
-
- expected_len = ovs_key_lens[type];
- if (nla_len(nla) != expected_len && expected_len != -1)
- return -EINVAL;
-
- attrs |= 1 << type;
- a[type] = nla;
- }
- if (rem)
- return -EINVAL;
-
- *attrsp = attrs;
- return 0;
-}
-
-/**
- * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
- * @swkey: receives the extracted flow key.
- * @key_lenp: number of bytes used in @swkey.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- */
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
- const struct nlattr *attr)
-{
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- const struct ovs_key_ethernet *eth_key;
- int key_len;
- u32 attrs;
- int err;
-
- memset(swkey, 0, sizeof(struct sw_flow_key));
- key_len = SW_FLOW_KEY_OFFSET(eth);
-
- err = parse_flow_nlattrs(attr, a, &attrs);
- if (err)
- return err;
-
- /* Metadata attributes. */
- if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
- swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
- attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
- }
- if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
- u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
- if (in_port >= DP_MAX_PORTS)
- return -EINVAL;
- swkey->phy.in_port = in_port;
- attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
- } else {
- swkey->phy.in_port = USHRT_MAX;
- }
-
- /* Data attributes. */
- if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
- return -EINVAL;
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
-
- eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
- memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
- memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
-
- if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
- nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
- const struct nlattr *encap;
- __be16 tci;
-
- if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
- (1 << OVS_KEY_ATTR_ETHERTYPE) |
- (1 << OVS_KEY_ATTR_ENCAP)))
- return -EINVAL;
-
- encap = a[OVS_KEY_ATTR_ENCAP];
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (tci & htons(VLAN_TAG_PRESENT)) {
- swkey->eth.tci = tci;
-
- err = parse_flow_nlattrs(encap, a, &attrs);
- if (err)
- return err;
- } else if (!tci) {
- /* Corner case for truncated 802.1Q header. */
- if (nla_len(encap))
- return -EINVAL;
-
- swkey->eth.type = htons(ETH_P_8021Q);
- *key_lenp = key_len;
- return 0;
- } else {
- return -EINVAL;
- }
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
- swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (ntohs(swkey->eth.type) < 1536)
- return -EINVAL;
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- } else {
- swkey->eth.type = htons(ETH_P_802_2);
- }
-
- if (swkey->eth.type == htons(ETH_P_IP)) {
- const struct ovs_key_ipv4 *ipv4_key;
-
- if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
- return -EINVAL;
- attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
-
- key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
- ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
- if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
- return -EINVAL;
- swkey->ip.proto = ipv4_key->ipv4_proto;
- swkey->ip.tos = ipv4_key->ipv4_tos;
- swkey->ip.ttl = ipv4_key->ipv4_ttl;
- swkey->ip.frag = ipv4_key->ipv4_frag;
- swkey->ipv4.addr.src = ipv4_key->ipv4_src;
- swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
-
- if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
- err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
- if (err)
- return err;
- }
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- const struct ovs_key_ipv6 *ipv6_key;
-
- if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
- return -EINVAL;
- attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
-
- key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
- ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
- if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
- return -EINVAL;
- swkey->ipv6.label = ipv6_key->ipv6_label;
- swkey->ip.proto = ipv6_key->ipv6_proto;
- swkey->ip.tos = ipv6_key->ipv6_tclass;
- swkey->ip.ttl = ipv6_key->ipv6_hlimit;
- swkey->ip.frag = ipv6_key->ipv6_frag;
- memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
- sizeof(swkey->ipv6.addr.src));
- memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
- sizeof(swkey->ipv6.addr.dst));
-
- if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
- err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
- if (err)
- return err;
- }
- } else if (swkey->eth.type == htons(ETH_P_ARP)) {
- const struct ovs_key_arp *arp_key;
-
- if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
- return -EINVAL;
- attrs &= ~(1 << OVS_KEY_ATTR_ARP);
-
- key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
- arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
- swkey->ipv4.addr.src = arp_key->arp_sip;
- swkey->ipv4.addr.dst = arp_key->arp_tip;
- if (arp_key->arp_op & htons(0xff00))
- return -EINVAL;
- swkey->ip.proto = ntohs(arp_key->arp_op);
- memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
- memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
- }
-
- if (attrs)
- return -EINVAL;
- *key_lenp = key_len;
-
- return 0;
-}
-
-/**
- * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
- * @in_port: receives the extracted input port.
- * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- *
- * This parses a series of Netlink attributes that form a flow key, which must
- * take the same form accepted by flow_from_nlattrs(), but only enough of it to
- * get the metadata, that is, the parts of the flow key that cannot be
- * extracted from the packet itself.
- */
-int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
- const struct nlattr *attr)
-{
- const struct nlattr *nla;
- int rem;
-
- *in_port = USHRT_MAX;
- *priority = 0;
-
- nla_for_each_nested(nla, attr, rem) {
- int type = nla_type(nla);
-
- if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
- if (nla_len(nla) != ovs_key_lens[type])
- return -EINVAL;
-
- switch (type) {
- case OVS_KEY_ATTR_PRIORITY:
- *priority = nla_get_u32(nla);
- break;
-
- case OVS_KEY_ATTR_IN_PORT:
- if (nla_get_u32(nla) >= DP_MAX_PORTS)
- return -EINVAL;
- *in_port = nla_get_u32(nla);
- break;
- }
- }
- }
- if (rem)
- return -EINVAL;
- return 0;
-}
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
-{
- struct ovs_key_ethernet *eth_key;
- struct nlattr *nla, *encap;
-
- if (swkey->phy.priority)
- NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority);
-
- if (swkey->phy.in_port != USHRT_MAX)
- NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port);
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
- if (!nla)
- goto nla_put_failure;
- eth_key = nla_data(nla);
- memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
- memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
-
- if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
- NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q));
- NLA_PUT_BE16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci);
- encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.tci)
- goto unencap;
- } else {
- encap = NULL;
- }
-
- if (swkey->eth.type == htons(ETH_P_802_2))
- goto unencap;
-
- NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type);
-
- if (swkey->eth.type == htons(ETH_P_IP)) {
- struct ovs_key_ipv4 *ipv4_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
- if (!nla)
- goto nla_put_failure;
- ipv4_key = nla_data(nla);
- ipv4_key->ipv4_src = swkey->ipv4.addr.src;
- ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
- ipv4_key->ipv4_proto = swkey->ip.proto;
- ipv4_key->ipv4_tos = swkey->ip.tos;
- ipv4_key->ipv4_ttl = swkey->ip.ttl;
- ipv4_key->ipv4_frag = swkey->ip.frag;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- struct ovs_key_ipv6 *ipv6_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
- if (!nla)
- goto nla_put_failure;
- ipv6_key = nla_data(nla);
- memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
- sizeof(ipv6_key->ipv6_src));
- memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
- sizeof(ipv6_key->ipv6_dst));
- ipv6_key->ipv6_label = swkey->ipv6.label;
- ipv6_key->ipv6_proto = swkey->ip.proto;
- ipv6_key->ipv6_tclass = swkey->ip.tos;
- ipv6_key->ipv6_hlimit = swkey->ip.ttl;
- ipv6_key->ipv6_frag = swkey->ip.frag;
- } else if (swkey->eth.type == htons(ETH_P_ARP)) {
- struct ovs_key_arp *arp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
- if (!nla)
- goto nla_put_failure;
- arp_key = nla_data(nla);
- memset(arp_key, 0, sizeof(struct ovs_key_arp));
- arp_key->arp_sip = swkey->ipv4.addr.src;
- arp_key->arp_tip = swkey->ipv4.addr.dst;
- arp_key->arp_op = htons(swkey->ip.proto);
- memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
- memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
- }
-
- if ((swkey->eth.type == htons(ETH_P_IP) ||
- swkey->eth.type == htons(ETH_P_IPV6)) &&
- swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-
- if (swkey->ip.proto == IPPROTO_TCP) {
- struct ovs_key_tcp *tcp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
- if (!nla)
- goto nla_put_failure;
- tcp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- tcp_key->tcp_src = swkey->ipv4.tp.src;
- tcp_key->tcp_dst = swkey->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- tcp_key->tcp_src = swkey->ipv6.tp.src;
- tcp_key->tcp_dst = swkey->ipv6.tp.dst;
- }
- } else if (swkey->ip.proto == IPPROTO_UDP) {
- struct ovs_key_udp *udp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
- if (!nla)
- goto nla_put_failure;
- udp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- udp_key->udp_src = swkey->ipv4.tp.src;
- udp_key->udp_dst = swkey->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- udp_key->udp_src = swkey->ipv6.tp.src;
- udp_key->udp_dst = swkey->ipv6.tp.dst;
- }
- } else if (swkey->eth.type == htons(ETH_P_IP) &&
- swkey->ip.proto == IPPROTO_ICMP) {
- struct ovs_key_icmp *icmp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
- if (!nla)
- goto nla_put_failure;
- icmp_key = nla_data(nla);
- icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
- icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
- } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
- swkey->ip.proto == IPPROTO_ICMPV6) {
- struct ovs_key_icmpv6 *icmpv6_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
- sizeof(*icmpv6_key));
- if (!nla)
- goto nla_put_failure;
- icmpv6_key = nla_data(nla);
- icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
- icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
-
- if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
- icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
- struct ovs_key_nd *nd_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
- if (!nla)
- goto nla_put_failure;
- nd_key = nla_data(nla);
- memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
- sizeof(nd_key->nd_target));
- memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
- memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
+ error = parse_icmpv6(skb, key, nh_len);
+ if (error)
+ return error;
}
}
}
-unencap:
- if (encap)
- nla_nest_end(skb, encap);
-
- return 0;
-
-nla_put_failure:
- return -EMSGSIZE;
-}
-
-/* Initializes the flow module.
- * Returns zero if successful or a negative error code. */
-int ovs_flow_init(void)
-{
- flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
- 0, NULL);
- if (flow_cache == NULL)
- return -ENOMEM;
-
return 0;
}
-
-/* Uninitializes the flow module. */
-void ovs_flow_exit(void)
-{
- kmem_cache_destroy(flow_cache);
-}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 2747dc2c4ac..5e5aaed3a85 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2014 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -19,6 +19,7 @@
#ifndef FLOW_H
#define FLOW_H 1
+#include <linux/cache.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/openvswitch.h>
@@ -34,17 +35,43 @@
struct sk_buff;
-struct sw_flow_actions {
- struct rcu_head rcu;
- u32 actions_len;
- struct nlattr actions[];
-};
+/* Used to memset ovs_key_ipv4_tunnel padding. */
+#define OVS_TUNNEL_KEY_SIZE \
+ (offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \
+ FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl))
+
+struct ovs_key_ipv4_tunnel {
+ __be64 tun_id;
+ __be32 ipv4_src;
+ __be32 ipv4_dst;
+ __be16 tun_flags;
+ u8 ipv4_tos;
+ u8 ipv4_ttl;
+} __packed __aligned(4); /* Minimize padding. */
+
+static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
+ const struct iphdr *iph, __be64 tun_id,
+ __be16 tun_flags)
+{
+ tun_key->tun_id = tun_id;
+ tun_key->ipv4_src = iph->saddr;
+ tun_key->ipv4_dst = iph->daddr;
+ tun_key->ipv4_tos = iph->tos;
+ tun_key->ipv4_ttl = iph->ttl;
+ tun_key->tun_flags = tun_flags;
+
+ /* clear struct padding. */
+ memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
+ sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
+}
struct sw_flow_key {
+ struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
- u16 in_port; /* Input switch port (or USHRT_MAX). */
- } phy;
+ u32 skb_mark; /* SKB mark. */
+ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
+ } __packed phy; /* Safe when right after 'tun_key'. */
struct {
u8 src[ETH_ALEN]; /* Ethernet source address. */
u8 dst[ETH_ALEN]; /* Ethernet destination address. */
@@ -57,22 +84,21 @@ struct sw_flow_key {
u8 ttl; /* IP TTL/hop limit. */
u8 frag; /* One of OVS_FRAG_TYPE_*. */
} ip;
+ struct {
+ __be16 src; /* TCP/UDP/SCTP source port. */
+ __be16 dst; /* TCP/UDP/SCTP destination port. */
+ __be16 flags; /* TCP flags. */
+ } tp;
union {
struct {
struct {
__be32 src; /* IP source address. */
__be32 dst; /* IP destination address. */
} addr;
- union {
- struct {
- __be16 src; /* TCP/UDP source port. */
- __be16 dst; /* TCP/UDP destination port. */
- } tp;
- struct {
- u8 sha[ETH_ALEN]; /* ARP source hardware address. */
- u8 tha[ETH_ALEN]; /* ARP target hardware address. */
- } arp;
- };
+ struct {
+ u8 sha[ETH_ALEN]; /* ARP source hardware address. */
+ u8 tha[ETH_ALEN]; /* ARP target hardware address. */
+ } arp;
} ipv4;
struct {
struct {
@@ -81,31 +107,63 @@ struct sw_flow_key {
} addr;
__be32 label; /* IPv6 flow label. */
struct {
- __be16 src; /* TCP/UDP source port. */
- __be16 dst; /* TCP/UDP destination port. */
- } tp;
- struct {
struct in6_addr target; /* ND target address. */
u8 sll[ETH_ALEN]; /* ND source link layer address. */
u8 tll[ETH_ALEN]; /* ND target link layer address. */
} nd;
} ipv6;
};
+} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+
+struct sw_flow_key_range {
+ unsigned short int start;
+ unsigned short int end;
+};
+
+struct sw_flow_mask {
+ int ref_count;
+ struct rcu_head rcu;
+ struct list_head list;
+ struct sw_flow_key_range range;
+ struct sw_flow_key key;
+};
+
+struct sw_flow_match {
+ struct sw_flow_key *key;
+ struct sw_flow_key_range range;
+ struct sw_flow_mask *mask;
+};
+
+struct sw_flow_actions {
+ struct rcu_head rcu;
+ u32 actions_len;
+ struct nlattr actions[];
+};
+
+struct flow_stats {
+ u64 packet_count; /* Number of packets matched. */
+ u64 byte_count; /* Number of bytes matched. */
+ unsigned long used; /* Last used time (in jiffies). */
+ spinlock_t lock; /* Lock for atomic stats update. */
+ __be16 tcp_flags; /* Union of seen TCP flags. */
};
struct sw_flow {
struct rcu_head rcu;
struct hlist_node hash_node[2];
u32 hash;
-
+ int stats_last_writer; /* NUMA-node id of the last writer on
+ * 'stats[0]'.
+ */
struct sw_flow_key key;
+ struct sw_flow_key unmasked_key;
+ struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
-
- spinlock_t lock; /* Lock for values below. */
- unsigned long used; /* Last used time (in jiffies). */
- u64 packet_count; /* Number of packets matched. */
- u64 byte_count; /* Number of bytes matched. */
- u8 tcp_flags; /* Union of seen TCP flags. */
+ struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
+ * is allocated at flow creation time,
+ * the rest are allocated on demand
+ * while holding the 'stats[0].lock'.
+ */
};
struct arp_eth_header {
@@ -122,78 +180,13 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
-int ovs_flow_init(void);
-void ovs_flow_exit(void);
-
-struct sw_flow *ovs_flow_alloc(void);
-void ovs_flow_deferred_free(struct sw_flow *);
-void ovs_flow_free(struct sw_flow *flow);
-
-struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *);
-void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
-
-int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
- int *key_lenp);
-void ovs_flow_used(struct sw_flow *, struct sk_buff *);
+void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags,
+ struct sk_buff *);
+void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
+ unsigned long *used, __be16 *tcp_flags);
+void ovs_flow_stats_clear(struct sw_flow *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
-/* Upper bound on the length of a nlattr-formatted flow key. The longest
- * nlattr-formatted flow key would be:
- *
- * struct pad nl hdr total
- * ------ --- ------ -----
- * OVS_KEY_ATTR_PRIORITY 4 -- 4 8
- * OVS_KEY_ATTR_IN_PORT 4 -- 4 8
- * OVS_KEY_ATTR_ETHERNET 12 -- 4 16
- * OVS_KEY_ATTR_8021Q 4 -- 4 8
- * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8
- * OVS_KEY_ATTR_IPV6 40 -- 4 44
- * OVS_KEY_ATTR_ICMPV6 2 2 4 8
- * OVS_KEY_ATTR_ND 28 -- 4 32
- * -------------------------------------------------
- * total 132
- */
-#define FLOW_BUFSIZE 132
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
- const struct nlattr *);
-int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
- const struct nlattr *);
-
-#define TBL_MIN_BUCKETS 1024
-
-struct flow_table {
- struct flex_array *buckets;
- unsigned int count, n_buckets;
- struct rcu_head rcu;
- int node_ver;
- u32 hash_seed;
- bool keep_flows;
-};
-
-static inline int ovs_flow_tbl_count(struct flow_table *table)
-{
- return table->count;
-}
-
-static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
-{
- return (table->count > table->n_buckets);
-}
-
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
- struct sw_flow_key *key, int len);
-void ovs_flow_tbl_destroy(struct flow_table *table);
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
-struct flow_table *ovs_flow_tbl_alloc(int new_size);
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
-struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow);
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
-u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len);
-
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
-extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
#endif /* flow.h */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
new file mode 100644
index 00000000000..d757848da89
--- /dev/null
+++ b/net/openvswitch/flow_netlink.c
@@ -0,0 +1,1576 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#include "flow_netlink.h"
+
+static void update_range__(struct sw_flow_match *match,
+ size_t offset, size_t size, bool is_mask)
+{
+ struct sw_flow_key_range *range = NULL;
+ size_t start = rounddown(offset, sizeof(long));
+ size_t end = roundup(offset + size, sizeof(long));
+
+ if (!is_mask)
+ range = &match->range;
+ else if (match->mask)
+ range = &match->mask->range;
+
+ if (!range)
+ return;
+
+ if (range->start == range->end) {
+ range->start = start;
+ range->end = end;
+ return;
+ }
+
+ if (range->start > start)
+ range->start = start;
+
+ if (range->end < end)
+ range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ sizeof((match)->key->field), is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ (match)->mask->key.field = value; \
+ } else { \
+ (match)->key->field = value; \
+ } \
+ } while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ len, is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ memcpy(&(match)->mask->key.field, value_p, len);\
+ } else { \
+ memcpy(&(match)->key->field, value_p, len); \
+ } \
+ } while (0)
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+static bool match_validate(const struct sw_flow_match *match,
+ u64 key_attrs, u64 mask_attrs)
+{
+ u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 mask_allowed = key_attrs; /* At most allow all key attributes */
+
+ /* The following mask attributes allowed only if they
+ * pass the validation tests. */
+ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_TCP)
+ | (1 << OVS_KEY_ATTR_TCP_FLAGS)
+ | (1 << OVS_KEY_ATTR_UDP)
+ | (1 << OVS_KEY_ATTR_SCTP)
+ | (1 << OVS_KEY_ATTR_ICMP)
+ | (1 << OVS_KEY_ATTR_ICMPV6)
+ | (1 << OVS_KEY_ATTR_ARP)
+ | (1 << OVS_KEY_ATTR_ND));
+
+ /* Always allowed mask fields. */
+ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+ | (1 << OVS_KEY_ATTR_IN_PORT)
+ | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+ /* Check key attributes. */
+ if (match->key->eth.type == htons(ETH_P_ARP)
+ || match->key->eth.type == htons(ETH_P_RARP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ARP;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMP) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+ }
+ }
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IPV6)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMPV6) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+ if (match->key->tp.src ==
+ htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ND;
+ if (match->mask && (match->mask->key.tp.src == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+ }
+ }
+ }
+ }
+
+ if ((key_attrs & key_expected) != key_expected) {
+ /* Key attributes check failed. */
+ OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+ (unsigned long long)key_attrs, (unsigned long long)key_expected);
+ return false;
+ }
+
+ if ((mask_attrs & mask_allowed) != mask_attrs) {
+ /* Mask attributes check failed. */
+ OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+ (unsigned long long)mask_attrs, (unsigned long long)mask_allowed);
+ return false;
+ }
+
+ return true;
+}
+
+/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
+static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
+ [OVS_KEY_ATTR_ENCAP] = -1,
+ [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
+ [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
+ [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
+ [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
+ [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
+ [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
+ [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
+ [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
+ [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
+ [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16),
+ [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
+ [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
+ [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
+ [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
+ [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
+ [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
+ [OVS_KEY_ATTR_TUNNEL] = -1,
+};
+
+static bool is_all_zero(const u8 *fp, size_t size)
+{
+ int i;
+
+ if (!fp)
+ return false;
+
+ for (i = 0; i < size; i++)
+ if (fp[i])
+ return false;
+
+ return true;
+}
+
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[],
+ u64 *attrsp, bool nz)
+{
+ const struct nlattr *nla;
+ u64 attrs;
+ int rem;
+
+ attrs = *attrsp;
+ nla_for_each_nested(nla, attr, rem) {
+ u16 type = nla_type(nla);
+ int expected_len;
+
+ if (type > OVS_KEY_ATTR_MAX) {
+ OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
+ type, OVS_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (attrs & (1 << type)) {
+ OVS_NLERR("Duplicate key attribute (type %d).\n", type);
+ return -EINVAL;
+ }
+
+ expected_len = ovs_key_lens[type];
+ if (nla_len(nla) != expected_len && expected_len != -1) {
+ OVS_NLERR("Key attribute has unexpected length (type=%d"
+ ", length=%d, expected=%d).\n", type,
+ nla_len(nla), expected_len);
+ return -EINVAL;
+ }
+
+ if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
+ attrs |= 1 << type;
+ a[type] = nla;
+ }
+ }
+ if (rem) {
+ OVS_NLERR("Message has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+
+ *attrsp = attrs;
+ return 0;
+}
+
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, true);
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, false);
+}
+
+static int ipv4_tun_from_nlattr(const struct nlattr *attr,
+ struct sw_flow_match *match, bool is_mask)
+{
+ struct nlattr *a;
+ int rem;
+ bool ttl = false;
+ __be16 tun_flags = 0;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+ [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
+ [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
+ [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
+ [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
+ [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
+ [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
+ [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+ };
+
+ if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+ OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
+ type, OVS_TUNNEL_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+ OVS_NLERR("IPv4 tunnel attribute type has unexpected "
+ " length (type=%d, length=%d, expected=%d).\n",
+ type, nla_len(a), ovs_tunnel_key_lens[type]);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_TUNNEL_KEY_ATTR_ID:
+ SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+ nla_get_be64(a), is_mask);
+ tun_flags |= TUNNEL_KEY;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+ nla_get_be32(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+ nla_get_be32(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TOS:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+ nla_get_u8(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TTL:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+ nla_get_u8(a), is_mask);
+ ttl = true;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
+ tun_flags |= TUNNEL_DONT_FRAGMENT;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_CSUM:
+ tun_flags |= TUNNEL_CSUM;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
+
+ if (rem > 0) {
+ OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+
+ if (!is_mask) {
+ if (!match->key->tun_key.ipv4_dst) {
+ OVS_NLERR("IPv4 tunnel destination address is zero.\n");
+ return -EINVAL;
+ }
+
+ if (!ttl) {
+ OVS_NLERR("IPv4 tunnel TTL not specified.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ipv4_tun_to_nlattr(struct sk_buff *skb,
+ const struct ovs_key_ipv4_tunnel *tun_key,
+ const struct ovs_key_ipv4_tunnel *output)
+{
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (output->tun_flags & TUNNEL_KEY &&
+ nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
+ return -EMSGSIZE;
+ if (output->ipv4_src &&
+ nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+ return -EMSGSIZE;
+ if (output->ipv4_dst &&
+ nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+ return -EMSGSIZE;
+ if (output->ipv4_tos &&
+ nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+ return -EMSGSIZE;
+ if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_CSUM) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+
+static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
+ const struct nlattr **a, bool is_mask)
+{
+ if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+ SW_FLOW_KEY_PUT(match, phy.priority,
+ nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+ u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
+
+ if (is_mask)
+ in_port = 0xffffffff; /* Always exact match in_port. */
+ else if (in_port >= DP_MAX_PORTS)
+ return -EINVAL;
+
+ SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+ uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+
+ SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+ if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+ is_mask))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
+ }
+ return 0;
+}
+
+static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
+ const struct nlattr **a, bool is_mask)
+{
+ int err;
+ u64 orig_attrs = attrs;
+
+ err = metadata_from_nlattrs(match, &attrs, a, is_mask);
+ if (err)
+ return err;
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
+ const struct ovs_key_ethernet *eth_key;
+
+ eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+ SW_FLOW_KEY_MEMCPY(match, eth.src,
+ eth_key->eth_src, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, eth.dst,
+ eth_key->eth_dst, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ __be16 tci;
+
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (is_mask)
+ OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
+ else
+ OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
+
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ } else if (!is_mask)
+ SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ const struct ovs_key_ipv4 *ipv4_key;
+
+ ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
+ if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
+ ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv4_key->ipv4_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv4_key->ipv4_tos, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv4_key->ipv4_ttl, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv4_key->ipv4_frag, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ ipv4_key->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ ipv4_key->ipv4_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
+ const struct ovs_key_ipv6 *ipv6_key;
+
+ ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
+ if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
+ ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ipv6.label,
+ ipv6_key->ipv6_label, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv6_key->ipv6_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv6_key->ipv6_tclass, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv6_key->ipv6_hlimit, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv6_key->ipv6_frag, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+ ipv6_key->ipv6_src,
+ sizeof(match->key->ipv6.addr.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+ ipv6_key->ipv6_dst,
+ sizeof(match->key->ipv6.addr.dst),
+ is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
+ const struct ovs_key_arp *arp_key;
+
+ arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+ if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+ OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
+ arp_key->arp_op);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ arp_key->arp_sip, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ arp_key->arp_tip, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ntohs(arp_key->arp_op), is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+ arp_key->arp_sha, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+ arp_key->arp_tha, ETH_ALEN, is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
+ const struct ovs_key_tcp *tcp_key;
+
+ tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+ SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
+ const struct ovs_key_udp *udp_key;
+
+ udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+ SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
+ const struct ovs_key_sctp *sctp_key;
+
+ sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
+ SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
+ const struct ovs_key_icmp *icmp_key;
+
+ icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+ SW_FLOW_KEY_PUT(match, tp.src,
+ htons(icmp_key->icmp_type), is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst,
+ htons(icmp_key->icmp_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
+ const struct ovs_key_icmpv6 *icmpv6_key;
+
+ icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+ SW_FLOW_KEY_PUT(match, tp.src,
+ htons(icmpv6_key->icmpv6_type), is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst,
+ htons(icmpv6_key->icmpv6_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ND)) {
+ const struct ovs_key_nd *nd_key;
+
+ nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+ nd_key->nd_target,
+ sizeof(match->key->ipv6.nd.target),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+ nd_key->nd_sll, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+ nd_key->nd_tll, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ND);
+ }
+
+ if (attrs != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sw_flow_mask_set(struct sw_flow_mask *mask,
+ struct sw_flow_key_range *range, u8 val)
+{
+ u8 *m = (u8 *)&mask->key + range->start;
+
+ mask->range = *range;
+ memset(m, val, range_n_bytes(range));
+}
+
+/**
+ * ovs_nla_get_match - parses Netlink attributes into a flow key and
+ * mask. In case the 'mask' is NULL, the flow is treated as exact match
+ * flow. Otherwise, it is treated as a wildcarded flow, except the mask
+ * does not include any don't care bit.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should of the packet that triggered the creation
+ * of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute specifies the mask field of the wildcarded flow.
+ */
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *key,
+ const struct nlattr *mask)
+{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ const struct nlattr *encap;
+ u64 key_attrs = 0;
+ u64 mask_attrs = 0;
+ bool encap_valid = false;
+ int err;
+
+ err = parse_flow_nlattrs(key, a, &key_attrs);
+ if (err)
+ return err;
+
+ if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+ (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
+ __be16 tci;
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+ OVS_NLERR("Invalid Vlan frame.\n");
+ return -EINVAL;
+ }
+
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ encap_valid = true;
+
+ if (tci & htons(VLAN_TAG_PRESENT)) {
+ err = parse_flow_nlattrs(encap, a, &key_attrs);
+ if (err)
+ return err;
+ } else if (!tci) {
+ /* Corner case for truncated 802.1Q header. */
+ if (nla_len(encap)) {
+ OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
+ return -EINVAL;
+ }
+ } else {
+ OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, key_attrs, a, false);
+ if (err)
+ return err;
+
+ if (mask) {
+ err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
+ if (err)
+ return err;
+
+ if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
+ __be16 eth_type = 0;
+ __be16 tci = 0;
+
+ if (!encap_valid) {
+ OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
+ return -EINVAL;
+ }
+
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (eth_type == htons(0xffff)) {
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
+ } else {
+ OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
+ ntohs(eth_type));
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
+ if (err)
+ return err;
+ } else {
+ /* Populate exact match flow's key mask. */
+ if (match->mask)
+ sw_flow_mask_set(match->mask, &match->range, 0xff);
+ }
+
+ if (!match_validate(match, key_attrs, mask_attrs))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
+ * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
+ * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence.
+ *
+ * This parses a series of Netlink attributes that form a flow key, which must
+ * take the same form accepted by flow_from_nlattrs(), but only enough of it to
+ * get the metadata, that is, the parts of the flow key that cannot be
+ * extracted from the packet itself.
+ */
+
+int ovs_nla_get_flow_metadata(struct sw_flow *flow,
+ const struct nlattr *attr)
+{
+ struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ u64 attrs = 0;
+ int err;
+ struct sw_flow_match match;
+
+ flow->key.phy.in_port = DP_MAX_PORTS;
+ flow->key.phy.priority = 0;
+ flow->key.phy.skb_mark = 0;
+ memset(tun_key, 0, sizeof(flow->key.tun_key));
+
+ err = parse_flow_nlattrs(attr, a, &attrs);
+ if (err)
+ return -EINVAL;
+
+ memset(&match, 0, sizeof(match));
+ match.key = &flow->key;
+
+ err = metadata_from_nlattrs(&match, &attrs, a, false);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int ovs_nla_put_flow(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb)
+{
+ struct ovs_key_ethernet *eth_key;
+ struct nlattr *nla, *encap;
+ bool is_mask = (swkey != output);
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
+ goto nla_put_failure;
+
+ if ((swkey->tun_key.ipv4_dst || is_mask) &&
+ ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
+ goto nla_put_failure;
+
+ if (swkey->phy.in_port == DP_MAX_PORTS) {
+ if (is_mask && (output->phy.in_port == 0xffff))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+ goto nla_put_failure;
+ } else {
+ u16 upper_u16;
+ upper_u16 = !is_mask ? 0 : 0xffff;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+ (upper_u16 << 16) | output->phy.in_port))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
+ goto nla_put_failure;
+
+ eth_key = nla_data(nla);
+ ether_addr_copy(eth_key->eth_src, output->eth.src);
+ ether_addr_copy(eth_key->eth_dst, output->eth.dst);
+
+ if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
+ __be16 eth_type;
+ eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+ nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
+ goto nla_put_failure;
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.tci)
+ goto unencap;
+ } else
+ encap = NULL;
+
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
+
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
+ goto nla_put_failure;
+
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ipv4 *ipv4_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv4_key = nla_data(nla);
+ ipv4_key->ipv4_src = output->ipv4.addr.src;
+ ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+ ipv4_key->ipv4_proto = output->ip.proto;
+ ipv4_key->ipv4_tos = output->ip.tos;
+ ipv4_key->ipv4_ttl = output->ip.ttl;
+ ipv4_key->ipv4_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ipv6 *ipv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_key = nla_data(nla);
+ memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
+ sizeof(ipv6_key->ipv6_src));
+ memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
+ sizeof(ipv6_key->ipv6_dst));
+ ipv6_key->ipv6_label = output->ipv6.label;
+ ipv6_key->ipv6_proto = output->ip.proto;
+ ipv6_key->ipv6_tclass = output->ip.tos;
+ ipv6_key->ipv6_hlimit = output->ip.ttl;
+ ipv6_key->ipv6_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+ swkey->eth.type == htons(ETH_P_RARP)) {
+ struct ovs_key_arp *arp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
+ if (!nla)
+ goto nla_put_failure;
+ arp_key = nla_data(nla);
+ memset(arp_key, 0, sizeof(struct ovs_key_arp));
+ arp_key->arp_sip = output->ipv4.addr.src;
+ arp_key->arp_tip = output->ipv4.addr.dst;
+ arp_key->arp_op = htons(output->ip.proto);
+ ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
+ ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
+ }
+
+ if ((swkey->eth.type == htons(ETH_P_IP) ||
+ swkey->eth.type == htons(ETH_P_IPV6)) &&
+ swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+
+ if (swkey->ip.proto == IPPROTO_TCP) {
+ struct ovs_key_tcp *tcp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
+ if (!nla)
+ goto nla_put_failure;
+ tcp_key = nla_data(nla);
+ tcp_key->tcp_src = output->tp.src;
+ tcp_key->tcp_dst = output->tp.dst;
+ if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+ output->tp.flags))
+ goto nla_put_failure;
+ } else if (swkey->ip.proto == IPPROTO_UDP) {
+ struct ovs_key_udp *udp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
+ if (!nla)
+ goto nla_put_failure;
+ udp_key = nla_data(nla);
+ udp_key->udp_src = output->tp.src;
+ udp_key->udp_dst = output->tp.dst;
+ } else if (swkey->ip.proto == IPPROTO_SCTP) {
+ struct ovs_key_sctp *sctp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
+ if (!nla)
+ goto nla_put_failure;
+ sctp_key = nla_data(nla);
+ sctp_key->sctp_src = output->tp.src;
+ sctp_key->sctp_dst = output->tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IP) &&
+ swkey->ip.proto == IPPROTO_ICMP) {
+ struct ovs_key_icmp *icmp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmp_key = nla_data(nla);
+ icmp_key->icmp_type = ntohs(output->tp.src);
+ icmp_key->icmp_code = ntohs(output->tp.dst);
+ } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
+ swkey->ip.proto == IPPROTO_ICMPV6) {
+ struct ovs_key_icmpv6 *icmpv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
+ sizeof(*icmpv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmpv6_key = nla_data(nla);
+ icmpv6_key->icmpv6_type = ntohs(output->tp.src);
+ icmpv6_key->icmpv6_code = ntohs(output->tp.dst);
+
+ if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
+ struct ovs_key_nd *nd_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
+ if (!nla)
+ goto nla_put_failure;
+ nd_key = nla_data(nla);
+ memcpy(nd_key->nd_target, &output->ipv6.nd.target,
+ sizeof(nd_key->nd_target));
+ ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll);
+ ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll);
+ }
+ }
+ }
+
+unencap:
+ if (encap)
+ nla_nest_end(skb, encap);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+#define MAX_ACTIONS_BUFSIZE (32 * 1024)
+
+struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size)
+{
+ struct sw_flow_actions *sfa;
+
+ if (size > MAX_ACTIONS_BUFSIZE)
+ return ERR_PTR(-EINVAL);
+
+ sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
+ if (!sfa)
+ return ERR_PTR(-ENOMEM);
+
+ sfa->actions_len = 0;
+ return sfa;
+}
+
+/* Schedules 'sf_acts' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ kfree_rcu(sf_acts, rcu);
+}
+
+static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
+ int attr_len)
+{
+
+ struct sw_flow_actions *acts;
+ int new_acts_size;
+ int req_size = NLA_ALIGN(attr_len);
+ int next_offset = offsetof(struct sw_flow_actions, actions) +
+ (*sfa)->actions_len;
+
+ if (req_size <= (ksize(*sfa) - next_offset))
+ goto out;
+
+ new_acts_size = ksize(*sfa) * 2;
+
+ if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
+ if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
+ return ERR_PTR(-EMSGSIZE);
+ new_acts_size = MAX_ACTIONS_BUFSIZE;
+ }
+
+ acts = ovs_nla_alloc_flow_actions(new_acts_size);
+ if (IS_ERR(acts))
+ return (void *)acts;
+
+ memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
+ acts->actions_len = (*sfa)->actions_len;
+ kfree(*sfa);
+ *sfa = acts;
+
+out:
+ (*sfa)->actions_len += req_size;
+ return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
+}
+
+static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
+{
+ struct nlattr *a;
+
+ a = reserve_sfa_size(sfa, nla_attr_size(len));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ a->nla_type = attrtype;
+ a->nla_len = nla_attr_size(len);
+
+ if (data)
+ memcpy(nla_data(a), data, len);
+ memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
+
+ return 0;
+}
+
+static inline int add_nested_action_start(struct sw_flow_actions **sfa,
+ int attrtype)
+{
+ int used = (*sfa)->actions_len;
+ int err;
+
+ err = add_action(sfa, attrtype, NULL, 0);
+ if (err)
+ return err;
+
+ return used;
+}
+
+static inline void add_nested_action_end(struct sw_flow_actions *sfa,
+ int st_offset)
+{
+ struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions +
+ st_offset);
+
+ a->nla_len = sfa->actions_len - st_offset;
+}
+
+static int validate_and_copy_sample(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa)
+{
+ const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
+ const struct nlattr *probability, *actions;
+ const struct nlattr *a;
+ int rem, start, err, st_acts;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
+ return -EINVAL;
+ attrs[type] = a;
+ }
+ if (rem)
+ return -EINVAL;
+
+ probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
+ if (!probability || nla_len(probability) != sizeof(u32))
+ return -EINVAL;
+
+ actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
+
+ /* validation done, copy sample action. */
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
+ if (start < 0)
+ return start;
+ err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
+ nla_data(probability), sizeof(u32));
+ if (err)
+ return err;
+ st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
+ if (st_acts < 0)
+ return st_acts;
+
+ err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, st_acts);
+ add_nested_action_end(*sfa, start);
+
+ return 0;
+}
+
+static int validate_tp_port(const struct sw_flow_key *flow_key)
+{
+ if ((flow_key->eth.type == htons(ETH_P_IP) ||
+ flow_key->eth.type == htons(ETH_P_IPV6)) &&
+ (flow_key->tp.src || flow_key->tp.dst))
+ return 0;
+
+ return -EINVAL;
+}
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key,
+ struct sw_flow_mask *mask)
+{
+ memset(match, 0, sizeof(*match));
+ match->key = key;
+ match->mask = mask;
+
+ memset(key, 0, sizeof(*key));
+
+ if (mask) {
+ memset(&mask->key, 0, sizeof(mask->key));
+ mask->range.start = mask->range.end = 0;
+ }
+}
+
+static int validate_and_copy_set_tun(const struct nlattr *attr,
+ struct sw_flow_actions **sfa)
+{
+ struct sw_flow_match match;
+ struct sw_flow_key key;
+ int err, start;
+
+ ovs_match_init(&match, &key, NULL);
+ err = ipv4_tun_from_nlattr(nla_data(attr), &match, false);
+ if (err)
+ return err;
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
+ if (start < 0)
+ return start;
+
+ err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
+ sizeof(match.key->tun_key));
+ add_nested_action_end(*sfa, start);
+
+ return err;
+}
+
+static int validate_set(const struct nlattr *a,
+ const struct sw_flow_key *flow_key,
+ struct sw_flow_actions **sfa,
+ bool *set_tun)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+
+ /* There can be only one key in a action */
+ if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+ return -EINVAL;
+
+ if (key_type > OVS_KEY_ATTR_MAX ||
+ (ovs_key_lens[key_type] != nla_len(ovs_key) &&
+ ovs_key_lens[key_type] != -1))
+ return -EINVAL;
+
+ switch (key_type) {
+ const struct ovs_key_ipv4 *ipv4_key;
+ const struct ovs_key_ipv6 *ipv6_key;
+ int err;
+
+ case OVS_KEY_ATTR_PRIORITY:
+ case OVS_KEY_ATTR_SKB_MARK:
+ case OVS_KEY_ATTR_ETHERNET:
+ break;
+
+ case OVS_KEY_ATTR_TUNNEL:
+ *set_tun = true;
+ err = validate_and_copy_set_tun(a, sfa);
+ if (err)
+ return err;
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ if (flow_key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (!flow_key->ip.proto)
+ return -EINVAL;
+
+ ipv4_key = nla_data(ovs_key);
+ if (ipv4_key->ipv4_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv4_key->ipv4_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_IPV6:
+ if (flow_key->eth.type != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ if (!flow_key->ip.proto)
+ return -EINVAL;
+
+ ipv6_key = nla_data(ovs_key);
+ if (ipv6_key->ipv6_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv6_key->ipv6_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ if (flow_key->ip.proto != IPPROTO_TCP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ case OVS_KEY_ATTR_UDP:
+ if (flow_key->ip.proto != IPPROTO_UDP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ case OVS_KEY_ATTR_SCTP:
+ if (flow_key->ip.proto != IPPROTO_SCTP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int validate_userspace(const struct nlattr *attr)
+{
+ static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
+ [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
+ [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
+ };
+ struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
+ int error;
+
+ error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
+ attr, userspace_policy);
+ if (error)
+ return error;
+
+ if (!a[OVS_USERSPACE_ATTR_PID] ||
+ !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int copy_action(const struct nlattr *from,
+ struct sw_flow_actions **sfa)
+{
+ int totlen = NLA_ALIGN(from->nla_len);
+ struct nlattr *to;
+
+ to = reserve_sfa_size(sfa, from->nla_len);
+ if (IS_ERR(to))
+ return PTR_ERR(to);
+
+ memcpy(to, from, totlen);
+ return 0;
+}
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth,
+ struct sw_flow_actions **sfa)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ if (depth >= SAMPLE_ACTION_DEPTH)
+ return -EOVERFLOW;
+
+ nla_for_each_nested(a, attr, rem) {
+ /* Expected argument lengths, (u32)-1 for variable length. */
+ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
+ [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
+ [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
+ [OVS_ACTION_ATTR_POP_VLAN] = 0,
+ [OVS_ACTION_ATTR_SET] = (u32)-1,
+ [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
+ };
+ const struct ovs_action_push_vlan *vlan;
+ int type = nla_type(a);
+ bool skip_copy;
+
+ if (type > OVS_ACTION_ATTR_MAX ||
+ (action_lens[type] != nla_len(a) &&
+ action_lens[type] != (u32)-1))
+ return -EINVAL;
+
+ skip_copy = false;
+ switch (type) {
+ case OVS_ACTION_ATTR_UNSPEC:
+ return -EINVAL;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ err = validate_userspace(a);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_OUTPUT:
+ if (nla_get_u32(a) >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ vlan = nla_data(a);
+ if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ return -EINVAL;
+ if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ return -EINVAL;
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = validate_set(a, key, sfa, &skip_copy);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = validate_and_copy_sample(a, key, depth, sfa);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (!skip_copy) {
+ err = copy_action(a, sfa);
+ if (err)
+ return err;
+ }
+ }
+
+ if (rem > 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ struct nlattr *start;
+ int err = 0, rem;
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
+ if (!start)
+ return -EMSGSIZE;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ struct nlattr *st_sample;
+
+ switch (type) {
+ case OVS_SAMPLE_ATTR_PROBABILITY:
+ if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_SAMPLE_ATTR_ACTIONS:
+ st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
+ if (!st_sample)
+ return -EMSGSIZE;
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ return err;
+ nla_nest_end(skb, st_sample);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return err;
+}
+
+static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+ struct nlattr *start;
+ int err;
+
+ switch (key_type) {
+ case OVS_KEY_ATTR_IPV4_TUNNEL:
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
+ if (!start)
+ return -EMSGSIZE;
+
+ err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
+ nla_data(ovs_key));
+ if (err)
+ return err;
+ nla_nest_end(skb, start);
+ break;
+ default:
+ if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
+ return -EMSGSIZE;
+ break;
+ }
+
+ return 0;
+}
+
+int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ nla_for_each_attr(a, attr, len, rem) {
+ int type = nla_type(a);
+
+ switch (type) {
+ case OVS_ACTION_ATTR_SET:
+ err = set_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = sample_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+ default:
+ if (nla_put(skb, type, nla_len(a), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ }
+ }
+
+ return 0;
+}
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
new file mode 100644
index 00000000000..440151045d3
--- /dev/null
+++ b/net/openvswitch/flow_netlink.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+
+#ifndef FLOW_NETLINK_H
+#define FLOW_NETLINK_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key, struct sw_flow_mask *mask);
+
+int ovs_nla_put_flow(const struct sw_flow_key *,
+ const struct sw_flow_key *, struct sk_buff *);
+int ovs_nla_get_flow_metadata(struct sw_flow *flow,
+ const struct nlattr *attr);
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *,
+ const struct nlattr *);
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa);
+int ovs_nla_put_actions(const struct nlattr *attr,
+ int len, struct sk_buff *skb);
+
+struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len);
+void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+
+#endif /* flow_netlink.h */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
new file mode 100644
index 00000000000..cf2d853646f
--- /dev/null
+++ b/net/openvswitch/flow_table.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#define TBL_MIN_BUCKETS 1024
+#define REHASH_INTERVAL (10 * 60 * HZ)
+
+static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask)
+{
+ const long *m = (const long *)((const u8 *)&mask->key +
+ mask->range.start);
+ const long *s = (const long *)((const u8 *)src +
+ mask->range.start);
+ long *d = (long *)((u8 *)dst + mask->range.start);
+ int i;
+
+ /* The memory outside of the 'mask->range' are not set since
+ * further operations on 'dst' only uses contents within
+ * 'mask->range'.
+ */
+ for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
+ *d++ = *s++ & *m++;
+}
+
+struct sw_flow *ovs_flow_alloc(void)
+{
+ struct sw_flow *flow;
+ struct flow_stats *stats;
+ int node;
+
+ flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ flow->sf_acts = NULL;
+ flow->mask = NULL;
+ flow->stats_last_writer = NUMA_NO_NODE;
+
+ /* Initialize the default stat node. */
+ stats = kmem_cache_alloc_node(flow_stats_cache,
+ GFP_KERNEL | __GFP_ZERO, 0);
+ if (!stats)
+ goto err;
+
+ spin_lock_init(&stats->lock);
+
+ RCU_INIT_POINTER(flow->stats[0], stats);
+
+ for_each_node(node)
+ if (node != 0)
+ RCU_INIT_POINTER(flow->stats[node], NULL);
+
+ return flow;
+err:
+ kmem_cache_free(flow_cache, flow);
+ return ERR_PTR(-ENOMEM);
+}
+
+int ovs_flow_tbl_count(struct flow_table *table)
+{
+ return table->count;
+}
+
+static struct flex_array *alloc_buckets(unsigned int n_buckets)
+{
+ struct flex_array *buckets;
+ int i, err;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
+ n_buckets, GFP_KERNEL);
+ if (!buckets)
+ return NULL;
+
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return NULL;
+ }
+
+ for (i = 0; i < n_buckets; i++)
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
+
+ return buckets;
+}
+
+static void flow_free(struct sw_flow *flow)
+{
+ int node;
+
+ kfree((struct sw_flow_actions __force *)flow->sf_acts);
+ for_each_node(node)
+ if (flow->stats[node])
+ kmem_cache_free(flow_stats_cache,
+ (struct flow_stats __force *)flow->stats[node]);
+ kmem_cache_free(flow_cache, flow);
+}
+
+static void rcu_free_flow_callback(struct rcu_head *rcu)
+{
+ struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
+
+ flow_free(flow);
+}
+
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
+{
+ if (!flow)
+ return;
+
+ if (deferred)
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+ else
+ flow_free(flow);
+}
+
+static void free_buckets(struct flex_array *buckets)
+{
+ flex_array_free(buckets);
+}
+
+
+static void __table_instance_destroy(struct table_instance *ti)
+{
+ free_buckets(ti->buckets);
+ kfree(ti);
+}
+
+static struct table_instance *table_instance_alloc(int new_size)
+{
+ struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+
+ if (!ti)
+ return NULL;
+
+ ti->buckets = alloc_buckets(new_size);
+
+ if (!ti->buckets) {
+ kfree(ti);
+ return NULL;
+ }
+ ti->n_buckets = new_size;
+ ti->node_ver = 0;
+ ti->keep_flows = false;
+ get_random_bytes(&ti->hash_seed, sizeof(u32));
+
+ return ti;
+}
+
+int ovs_flow_tbl_init(struct flow_table *table)
+{
+ struct table_instance *ti;
+
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
+
+ if (!ti)
+ return -ENOMEM;
+
+ rcu_assign_pointer(table->ti, ti);
+ INIT_LIST_HEAD(&table->mask_list);
+ table->last_rehash = jiffies;
+ table->count = 0;
+ return 0;
+}
+
+static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
+{
+ struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+
+ __table_instance_destroy(ti);
+}
+
+static void table_instance_destroy(struct table_instance *ti, bool deferred)
+{
+ int i;
+
+ if (!ti)
+ return;
+
+ if (ti->keep_flows)
+ goto skip_flows;
+
+ for (i = 0; i < ti->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head = flex_array_get(ti->buckets, i);
+ struct hlist_node *n;
+ int ver = ti->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+ hlist_del_rcu(&flow->hash_node[ver]);
+ ovs_flow_free(flow, deferred);
+ }
+ }
+
+skip_flows:
+ if (deferred)
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ else
+ __table_instance_destroy(ti);
+}
+
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+
+ table_instance_destroy(ti, deferred);
+}
+
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
+ u32 *bucket, u32 *last)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int ver;
+ int i;
+
+ ver = ti->node_ver;
+ while (*bucket < ti->n_buckets) {
+ i = 0;
+ head = flex_array_get(ti->buckets, *bucket);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
+ if (i < *last) {
+ i++;
+ continue;
+ }
+ *last = i + 1;
+ return flow;
+ }
+ (*bucket)++;
+ *last = 0;
+ }
+
+ return NULL;
+}
+
+static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
+{
+ hash = jhash_1word(hash, ti->hash_seed);
+ return flex_array_get(ti->buckets,
+ (hash & (ti->n_buckets - 1)));
+}
+
+static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow)
+{
+ struct hlist_head *head;
+
+ head = find_bucket(ti, flow->hash);
+ hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head);
+}
+
+static void flow_table_copy_flows(struct table_instance *old,
+ struct table_instance *new)
+{
+ int old_ver;
+ int i;
+
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
+
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head;
+
+ head = flex_array_get(old->buckets, i);
+
+ hlist_for_each_entry(flow, head, hash_node[old_ver])
+ table_instance_insert(new, flow);
+ }
+
+ old->keep_flows = true;
+}
+
+static struct table_instance *table_instance_rehash(struct table_instance *ti,
+ int n_buckets)
+{
+ struct table_instance *new_ti;
+
+ new_ti = table_instance_alloc(n_buckets);
+ if (!new_ti)
+ return NULL;
+
+ flow_table_copy_flows(ti, new_ti);
+
+ return new_ti;
+}
+
+int ovs_flow_tbl_flush(struct flow_table *flow_table)
+{
+ struct table_instance *old_ti;
+ struct table_instance *new_ti;
+
+ old_ti = ovsl_dereference(flow_table->ti);
+ new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!new_ti)
+ return -ENOMEM;
+
+ rcu_assign_pointer(flow_table->ti, new_ti);
+ flow_table->last_rehash = jiffies;
+ flow_table->count = 0;
+
+ table_instance_destroy(old_ti, true);
+ return 0;
+}
+
+static u32 flow_hash(const struct sw_flow_key *key, int key_start,
+ int key_end)
+{
+ const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
+ int hash_u32s = (key_end - key_start) >> 2;
+
+ /* Make sure number of hash bytes are multiple of u32. */
+ BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+
+ return arch_fast_hash2(hash_key, hash_u32s, 0);
+}
+
+static int flow_key_start(const struct sw_flow_key *key)
+{
+ if (key->tun_key.ipv4_dst)
+ return 0;
+ else
+ return rounddown(offsetof(struct sw_flow_key, phy),
+ sizeof(long));
+}
+
+static bool cmp_key(const struct sw_flow_key *key1,
+ const struct sw_flow_key *key2,
+ int key_start, int key_end)
+{
+ const long *cp1 = (const long *)((const u8 *)key1 + key_start);
+ const long *cp2 = (const long *)((const u8 *)key2 + key_start);
+ long diffs = 0;
+ int i;
+
+ for (i = key_start; i < key_end; i += sizeof(long))
+ diffs |= *cp1++ ^ *cp2++;
+
+ return diffs == 0;
+}
+
+static bool flow_cmp_masked_key(const struct sw_flow *flow,
+ const struct sw_flow_key *key,
+ int key_start, int key_end)
+{
+ return cmp_key(&flow->key, key, key_start, key_end);
+}
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ struct sw_flow_match *match)
+{
+ struct sw_flow_key *key = match->key;
+ int key_start = flow_key_start(key);
+ int key_end = match->range.end;
+
+ return cmp_key(&flow->unmasked_key, key, key_start, key_end);
+}
+
+static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
+ const struct sw_flow_key *unmasked,
+ struct sw_flow_mask *mask)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int key_start = mask->range.start;
+ int key_end = mask->range.end;
+ u32 hash;
+ struct sw_flow_key masked_key;
+
+ ovs_flow_mask_key(&masked_key, unmasked, mask);
+ hash = flow_hash(&masked_key, key_start, key_end);
+ head = find_bucket(ti, hash);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) {
+ if (flow->mask == mask && flow->hash == hash &&
+ flow_cmp_masked_key(flow, &masked_key,
+ key_start, key_end))
+ return flow;
+ }
+ return NULL;
+}
+
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit)
+{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ *n_mask_hit = 0;
+ list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
+ (*n_mask_hit)++;
+ flow = masked_flow_lookup(ti, key, mask);
+ if (flow) /* Found */
+ return flow;
+ }
+ return NULL;
+}
+
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
+ const struct sw_flow_key *key)
+{
+ u32 __always_unused n_mask_hit;
+
+ return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
+}
+
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+ struct sw_flow_match *match)
+{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ /* Always called under ovs-mutex. */
+ list_for_each_entry(mask, &tbl->mask_list, list) {
+ flow = masked_flow_lookup(ti, match->key, mask);
+ if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */
+ return flow;
+ }
+ return NULL;
+}
+
+int ovs_flow_tbl_num_masks(const struct flow_table *table)
+{
+ struct sw_flow_mask *mask;
+ int num = 0;
+
+ list_for_each_entry(mask, &table->mask_list, list)
+ num++;
+
+ return num;
+}
+
+static struct table_instance *table_instance_expand(struct table_instance *ti)
+{
+ return table_instance_rehash(ti, ti->n_buckets * 2);
+}
+
+/* Remove 'mask' from the mask list, if it is not needed any more. */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+ if (mask) {
+ /* ovs-lock is required to protect mask-refcount and
+ * mask list.
+ */
+ ASSERT_OVSL();
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count) {
+ list_del_rcu(&mask->list);
+ kfree_rcu(mask, rcu);
+ }
+ }
+}
+
+/* Must be called with OVS mutex held. */
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+
+ BUG_ON(table->count == 0);
+ hlist_del_rcu(&flow->hash_node[ti->node_ver]);
+ table->count--;
+
+ /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
+ * accessible as long as the RCU read lock is held.
+ */
+ flow_mask_remove(table, flow->mask);
+}
+
+static struct sw_flow_mask *mask_alloc(void)
+{
+ struct sw_flow_mask *mask;
+
+ mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+ if (mask)
+ mask->ref_count = 1;
+
+ return mask;
+}
+
+static bool mask_equal(const struct sw_flow_mask *a,
+ const struct sw_flow_mask *b)
+{
+ const u8 *a_ = (const u8 *)&a->key + a->range.start;
+ const u8 *b_ = (const u8 *)&b->key + b->range.start;
+
+ return (a->range.end == b->range.end)
+ && (a->range.start == b->range.start)
+ && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
+}
+
+static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
+ const struct sw_flow_mask *mask)
+{
+ struct list_head *ml;
+
+ list_for_each(ml, &tbl->mask_list) {
+ struct sw_flow_mask *m;
+ m = container_of(ml, struct sw_flow_mask, list);
+ if (mask_equal(mask, m))
+ return m;
+ }
+
+ return NULL;
+}
+
+/* Add 'mask' into the mask list, if it is not already there. */
+static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
+ struct sw_flow_mask *new)
+{
+ struct sw_flow_mask *mask;
+ mask = flow_mask_find(tbl, new);
+ if (!mask) {
+ /* Allocate a new mask if none exsits. */
+ mask = mask_alloc();
+ if (!mask)
+ return -ENOMEM;
+ mask->key = new->key;
+ mask->range = new->range;
+ list_add_rcu(&mask->list, &tbl->mask_list);
+ } else {
+ BUG_ON(!mask->ref_count);
+ mask->ref_count++;
+ }
+
+ flow->mask = mask;
+ return 0;
+}
+
+/* Must be called with OVS mutex held. */
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ struct sw_flow_mask *mask)
+{
+ struct table_instance *new_ti = NULL;
+ struct table_instance *ti;
+ int err;
+
+ err = flow_mask_insert(table, flow, mask);
+ if (err)
+ return err;
+
+ flow->hash = flow_hash(&flow->key, flow->mask->range.start,
+ flow->mask->range.end);
+ ti = ovsl_dereference(table->ti);
+ table_instance_insert(ti, flow);
+ table->count++;
+
+ /* Expand table, if necessary, to make room. */
+ if (table->count > ti->n_buckets)
+ new_ti = table_instance_expand(ti);
+ else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
+ new_ti = table_instance_rehash(ti, ti->n_buckets);
+
+ if (new_ti) {
+ rcu_assign_pointer(table->ti, new_ti);
+ table_instance_destroy(ti, true);
+ table->last_rehash = jiffies;
+ }
+ return 0;
+}
+
+/* Initializes the flow module.
+ * Returns zero if successful or a negative error code. */
+int ovs_flow_init(void)
+{
+ BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
+ BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
+
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+ + (num_possible_nodes()
+ * sizeof(struct flow_stats *)),
+ 0, 0, NULL);
+ if (flow_cache == NULL)
+ return -ENOMEM;
+
+ flow_stats_cache
+ = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (flow_stats_cache == NULL) {
+ kmem_cache_destroy(flow_cache);
+ flow_cache = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/* Uninitializes the flow module. */
+void ovs_flow_exit(void)
+{
+ kmem_cache_destroy(flow_stats_cache);
+ kmem_cache_destroy(flow_cache);
+}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
new file mode 100644
index 00000000000..5918bff7f3f
--- /dev/null
+++ b/net/openvswitch/flow_table.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef FLOW_TABLE_H
+#define FLOW_TABLE_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+struct table_instance {
+ struct flex_array *buckets;
+ unsigned int n_buckets;
+ struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ bool keep_flows;
+};
+
+struct flow_table {
+ struct table_instance __rcu *ti;
+ struct list_head mask_list;
+ unsigned long last_rehash;
+ unsigned int count;
+};
+
+extern struct kmem_cache *flow_stats_cache;
+
+int ovs_flow_init(void);
+void ovs_flow_exit(void);
+
+struct sw_flow *ovs_flow_alloc(void);
+void ovs_flow_free(struct sw_flow *, bool deferred);
+
+int ovs_flow_tbl_init(struct flow_table *);
+int ovs_flow_tbl_count(struct flow_table *table);
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
+int ovs_flow_tbl_flush(struct flow_table *flow_table);
+
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ struct sw_flow_mask *mask);
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
+int ovs_flow_tbl_num_masks(const struct flow_table *table);
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
+ u32 *bucket, u32 *idx);
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *,
+ const struct sw_flow_key *,
+ u32 *n_mask_hit);
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
+ const struct sw_flow_key *);
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+ struct sw_flow_match *match);
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ struct sw_flow_match *match);
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask);
+#endif /* flow_table.h */
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
new file mode 100644
index 00000000000..f49148a07da
--- /dev/null
+++ b/net/openvswitch/vport-gre.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/gre.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/protocol.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 be64_get_low32(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be32)x;
+#else
+ return (__force __be32)((__force u64)x >> 32);
+#endif
+}
+
+static __be16 filter_tnl_flags(__be16 flags)
+{
+ return flags & (TUNNEL_CSUM | TUNNEL_KEY);
+}
+
+static struct sk_buff *__build_header(struct sk_buff *skb,
+ int tunnel_hlen)
+{
+ const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
+ struct tnl_ptk_info tpi;
+
+ skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ return NULL;
+
+ tpi.flags = filter_tnl_flags(tun_key->tun_flags);
+ tpi.proto = htons(ETH_P_TEB);
+ tpi.key = be64_get_low32(tun_key->tun_id);
+ tpi.seq = 0;
+ gre_build_header(skb, &tpi, tunnel_hlen);
+
+ return skb;
+}
+
+static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
+#else
+ return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
+#endif
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_rcv(struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi)
+{
+ struct ovs_key_ipv4_tunnel tun_key;
+ struct ovs_net *ovs_net;
+ struct vport *vport;
+ __be64 key;
+
+ ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
+ vport = rcu_dereference(ovs_net->vport_net.gre_vport);
+ if (unlikely(!vport))
+ return PACKET_REJECT;
+
+ key = key_to_tunnel_id(tpi->key, tpi->seq);
+ ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key,
+ filter_tnl_flags(tpi->flags));
+
+ ovs_vport_receive(vport, skb, &tun_key);
+ return PACKET_RCVD;
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
+{
+ struct ovs_net *ovs_net;
+ struct vport *vport;
+
+ ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
+ vport = rcu_dereference(ovs_net->vport_net.gre_vport);
+
+ if (unlikely(!vport))
+ return PACKET_REJECT;
+ else
+ return PACKET_RCVD;
+}
+
+static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct flowi4 fl;
+ struct rtable *rt;
+ int min_headroom;
+ int tunnel_hlen;
+ __be16 df;
+ int err;
+
+ if (unlikely(!OVS_CB(skb)->tun_key)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ /* Route lookup */
+ memset(&fl, 0, sizeof(fl));
+ fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+ fl.saddr = OVS_CB(skb)->tun_key->ipv4_src;
+ fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos);
+ fl.flowi4_mark = skb->mark;
+ fl.flowi4_proto = IPPROTO_GRE;
+
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags);
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + tunnel_hlen + sizeof(struct iphdr)
+ + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ if (vlan_tx_tag_present(skb)) {
+ if (unlikely(!__vlan_put_tag(skb,
+ skb->vlan_proto,
+ vlan_tx_tag_get(skb)))) {
+ err = -ENOMEM;
+ goto err_free_rt;
+ }
+ skb->vlan_tci = 0;
+ }
+
+ /* Push Tunnel header. */
+ skb = __build_header(skb, tunnel_hlen);
+ if (unlikely(!skb)) {
+ err = 0;
+ goto err_free_rt;
+ }
+
+ df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+ htons(IP_DF) : 0;
+
+ skb->ignore_df = 1;
+
+ return iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+ OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE,
+ OVS_CB(skb)->tun_key->ipv4_tos,
+ OVS_CB(skb)->tun_key->ipv4_ttl, df, false);
+err_free_rt:
+ ip_rt_put(rt);
+error:
+ return err;
+}
+
+static struct gre_cisco_protocol gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .priority = 1,
+};
+
+static int gre_ports;
+static int gre_init(void)
+{
+ int err;
+
+ gre_ports++;
+ if (gre_ports > 1)
+ return 0;
+
+ err = gre_cisco_register(&gre_protocol);
+ if (err)
+ pr_warn("cannot register gre protocol handler\n");
+
+ return err;
+}
+
+static void gre_exit(void)
+{
+ gre_ports--;
+ if (gre_ports > 0)
+ return;
+
+ gre_cisco_unregister(&gre_protocol);
+}
+
+static const char *gre_get_name(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+static struct vport *gre_create(const struct vport_parms *parms)
+{
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct ovs_net *ovs_net;
+ struct vport *vport;
+ int err;
+
+ err = gre_init();
+ if (err)
+ return ERR_PTR(err);
+
+ ovs_net = net_generic(net, ovs_net_id);
+ if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
+ vport = ERR_PTR(-EEXIST);
+ goto error;
+ }
+
+ vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+ if (IS_ERR(vport))
+ goto error;
+
+ strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
+ rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
+ return vport;
+
+error:
+ gre_exit();
+ return vport;
+}
+
+static void gre_tnl_destroy(struct vport *vport)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct ovs_net *ovs_net;
+
+ ovs_net = net_generic(net, ovs_net_id);
+
+ RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL);
+ ovs_vport_deferred_free(vport);
+ gre_exit();
+}
+
+const struct vport_ops ovs_gre_vport_ops = {
+ .type = OVS_VPORT_TYPE_GRE,
+ .create = gre_create,
+ .destroy = gre_tnl_destroy,
+ .get_name = gre_get_name,
+ .send = gre_tnl_send,
+};
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 322b8d20669..789af9280e7 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -24,6 +24,9 @@
#include <linux/ethtool.h>
#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+
#include "datapath.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -60,21 +63,11 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde
return stats;
}
-static int internal_dev_mac_addr(struct net_device *dev, void *p)
-{
- struct sockaddr *addr = p;
-
- if (!is_valid_ether_addr(addr->sa_data))
- return -EADDRNOTAVAIL;
- memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
- return 0;
-}
-
/* Called with rcu_read_lock_bh. */
static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
rcu_read_lock();
- ovs_vport_receive(internal_dev_priv(netdev)->vport, skb);
+ ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
rcu_read_unlock();
return 0;
}
@@ -94,7 +87,7 @@ static int internal_dev_stop(struct net_device *netdev)
static void internal_dev_getinfo(struct net_device *netdev,
struct ethtool_drvinfo *info)
{
- strcpy(info->driver, "openvswitch");
+ strlcpy(info->driver, "openvswitch", sizeof(info->driver));
}
static const struct ethtool_ops internal_dev_ethtool_ops = {
@@ -123,7 +116,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
- .ndo_set_mac_address = internal_dev_mac_addr,
+ .ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_dev_get_stats,
};
@@ -135,17 +128,18 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
netdev->destructor = internal_dev_destructor;
- SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops);
+ netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->tx_queue_len = 0;
netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
- NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
+ NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
netdev->vlan_features = netdev->features;
- netdev->features |= NETIF_F_HW_VLAN_TX;
+ netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
- random_ether_addr(netdev->dev_addr);
+ eth_hw_addr_random(netdev);
}
static struct vport *internal_dev_create(const struct vport_parms *parms)
@@ -171,19 +165,27 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
goto error_free_vport;
}
+ dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp));
internal_dev = internal_dev_priv(netdev_vport->dev);
internal_dev->vport = vport;
+ /* Restrict bridge port to current netns. */
+ if (vport->port_no == OVSP_LOCAL)
+ netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL;
+
+ rtnl_lock();
err = register_netdevice(netdev_vport->dev);
if (err)
goto error_free_netdev;
dev_set_promiscuity(netdev_vport->dev, 1);
+ rtnl_unlock();
netif_start_queue(netdev_vport->dev);
return vport;
error_free_netdev:
+ rtnl_unlock();
free_netdev(netdev_vport->dev);
error_free_vport:
ovs_vport_free(vport);
@@ -196,10 +198,13 @@ static void internal_dev_destroy(struct vport *vport)
struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
netif_stop_queue(netdev_vport->dev);
+ rtnl_lock();
dev_set_promiscuity(netdev_vport->dev, -1);
/* unregister_netdevice() waits for an RCU grace period. */
unregister_netdevice(netdev_vport->dev);
+
+ rtnl_unlock();
}
static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
@@ -208,9 +213,15 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
int len;
len = skb->len;
+
+ skb_dst_drop(skb);
+ nf_reset(skb);
+ secpath_reset(skb);
+
skb->dev = netdev;
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, netdev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
netif_rx(skb);
@@ -222,7 +233,6 @@ const struct vport_ops ovs_internal_vport_ops = {
.create = internal_dev_create,
.destroy = internal_dev_destroy,
.get_name = ovs_netdev_get_name,
- .get_ifindex = ovs_netdev_get_ifindex,
.send = internal_dev_recv,
};
diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h
index 3454447c5f1..9a7d30ecc6a 100644
--- a/net/openvswitch/vport-internal_dev.h
+++ b/net/openvswitch/vport-internal_dev.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2011 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index c1068aed03d..d21f77d875b 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -25,6 +25,7 @@
#include <linux/llc.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
+#include <linux/openvswitch.h>
#include <net/llc.h>
@@ -35,21 +36,27 @@
/* Must be called with rcu_read_lock. */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
- if (unlikely(!vport)) {
- kfree_skb(skb);
- return;
- }
+ if (unlikely(!vport))
+ goto error;
+
+ if (unlikely(skb_warn_if_lro(skb)))
+ goto error;
/* Make our own copy of the packet. Otherwise we will mangle the
* packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
- * (No one comes after us, since we tell handle_bridge() that we took
- * the packet.) */
+ */
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
return;
skb_push(skb, ETH_HLEN);
- ovs_vport_receive(vport, skb);
+ ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+
+ ovs_vport_receive(vport, skb, NULL);
+ return;
+
+error:
+ kfree_skb(skb);
}
/* Called with rcu_read_lock and bottom-halves disabled. */
@@ -68,6 +75,15 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
return RX_HANDLER_CONSUMED;
}
+static struct net_device *get_dpdev(struct datapath *dp)
+{
+ struct vport *local;
+
+ local = ovs_vport_ovsl(dp, OVSP_LOCAL);
+ BUG_ON(!local);
+ return netdev_vport_priv(local)->dev;
+}
+
static struct vport *netdev_create(const struct vport_parms *parms)
{
struct vport *vport;
@@ -83,7 +99,7 @@ static struct vport *netdev_create(const struct vport_parms *parms)
netdev_vport = netdev_vport_priv(vport);
- netdev_vport->dev = dev_get_by_name(&init_net, parms->name);
+ netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
if (!netdev_vport->dev) {
err = -ENODEV;
goto error_free_vport;
@@ -96,16 +112,27 @@ static struct vport *netdev_create(const struct vport_parms *parms)
goto error_put;
}
+ rtnl_lock();
+ err = netdev_master_upper_dev_link(netdev_vport->dev,
+ get_dpdev(vport->dp));
+ if (err)
+ goto error_unlock;
+
err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
vport);
if (err)
- goto error_put;
+ goto error_master_upper_dev_unlink;
dev_set_promiscuity(netdev_vport->dev, 1);
netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+ rtnl_unlock();
return vport;
+error_master_upper_dev_unlink:
+ netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+error_unlock:
+ rtnl_unlock();
error_put:
dev_put(netdev_vport->dev);
error_free_vport:
@@ -114,18 +141,37 @@ error:
return ERR_PTR(err);
}
-static void netdev_destroy(struct vport *vport)
+static void free_port_rcu(struct rcu_head *rcu)
+{
+ struct netdev_vport *netdev_vport = container_of(rcu,
+ struct netdev_vport, rcu);
+
+ dev_put(netdev_vport->dev);
+ ovs_vport_free(vport_from_priv(netdev_vport));
+}
+
+void ovs_netdev_detach_dev(struct vport *vport)
{
struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ ASSERT_RTNL();
netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
netdev_rx_handler_unregister(netdev_vport->dev);
+ netdev_upper_dev_unlink(netdev_vport->dev,
+ netdev_master_upper_dev_get(netdev_vport->dev));
dev_set_promiscuity(netdev_vport->dev, -1);
+}
+
+static void netdev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- synchronize_rcu();
+ rtnl_lock();
+ if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
+ rtnl_unlock();
- dev_put(netdev_vport->dev);
- ovs_vport_free(vport);
+ call_rcu(&netdev_vport->rcu, free_port_rcu);
}
const char *ovs_netdev_get_name(const struct vport *vport)
@@ -134,15 +180,9 @@ const char *ovs_netdev_get_name(const struct vport *vport)
return netdev_vport->dev->name;
}
-int ovs_netdev_get_ifindex(const struct vport *vport)
+static unsigned int packet_length(const struct sk_buff *skb)
{
- const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- return netdev_vport->dev->ifindex;
-}
-
-static unsigned packet_length(const struct sk_buff *skb)
-{
- unsigned length = skb->len - ETH_HLEN;
+ unsigned int length = skb->len - ETH_HLEN;
if (skb->protocol == htons(ETH_P_8021Q))
length -= VLAN_HLEN;
@@ -157,24 +197,20 @@ static int netdev_send(struct vport *vport, struct sk_buff *skb)
int len;
if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
- if (net_ratelimit())
- pr_warn("%s: dropped over-mtu packet: %d > %d\n",
- ovs_dp_name(vport->dp), packet_length(skb), mtu);
- goto error;
+ net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
+ netdev_vport->dev->name,
+ packet_length(skb), mtu);
+ goto drop;
}
- if (unlikely(skb_warn_if_lro(skb)))
- goto error;
-
skb->dev = netdev_vport->dev;
len = skb->len;
dev_queue_xmit(skb);
return len;
-error:
+drop:
kfree_skb(skb);
- ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
return 0;
}
@@ -193,6 +229,5 @@ const struct vport_ops ovs_netdev_vport_ops = {
.create = netdev_create,
.destroy = netdev_destroy,
.get_name = ovs_netdev_get_name,
- .get_ifindex = ovs_netdev_get_ifindex,
.send = netdev_send,
};
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index fd9b008a0e6..8df01c1127e 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2011 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -20,12 +20,15 @@
#define VPORT_NETDEV_H 1
#include <linux/netdevice.h>
+#include <linux/rcupdate.h>
#include "vport.h"
struct vport *ovs_netdev_get_vport(struct net_device *dev);
struct netdev_vport {
+ struct rcu_head rcu;
+
struct net_device *dev;
};
@@ -36,7 +39,6 @@ netdev_vport_priv(const struct vport *vport)
}
const char *ovs_netdev_get_name(const struct vport *);
-const char *ovs_netdev_get_config(const struct vport *);
-int ovs_netdev_get_ifindex(const struct vport *);
+void ovs_netdev_detach_dev(struct vport *);
#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
new file mode 100644
index 00000000000..0edbd95c60e
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ * Copyright (c) 2013 Cisco Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/vxlan.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+/**
+ * struct vxlan_port - Keeps track of open UDP ports
+ * @vs: vxlan_sock created for the port.
+ * @name: vport name.
+ */
+struct vxlan_port {
+ struct vxlan_sock *vs;
+ char name[IFNAMSIZ];
+};
+
+static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+{
+ struct ovs_key_ipv4_tunnel tun_key;
+ struct vport *vport = vs->data;
+ struct iphdr *iph;
+ __be64 key;
+
+ /* Save outer tunnel values */
+ iph = ip_hdr(skb);
+ key = cpu_to_be64(ntohl(vx_vni) >> 8);
+ ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+
+ ovs_vport_receive(vport, skb, &tun_key);
+}
+
+static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static void vxlan_tnl_destroy(struct vport *vport)
+{
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+
+ vxlan_sock_release(vxlan_port->vs);
+
+ ovs_vport_deferred_free(vport);
+}
+
+static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
+{
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct nlattr *options = parms->options;
+ struct vxlan_port *vxlan_port;
+ struct vxlan_sock *vs;
+ struct vport *vport;
+ struct nlattr *a;
+ u16 dst_port;
+ int err;
+
+ if (!options) {
+ err = -EINVAL;
+ goto error;
+ }
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+ if (a && nla_len(a) == sizeof(u16)) {
+ dst_port = nla_get_u16(a);
+ } else {
+ /* Require destination port from userspace. */
+ err = -EINVAL;
+ goto error;
+ }
+
+ vport = ovs_vport_alloc(sizeof(struct vxlan_port),
+ &ovs_vxlan_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ vxlan_port = vxlan_vport(vport);
+ strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
+
+ vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0);
+ if (IS_ERR(vs)) {
+ ovs_vport_free(vport);
+ return (void *)vs;
+ }
+ vxlan_port->vs = vs;
+
+ return vport;
+
+error:
+ return ERR_PTR(err);
+}
+
+static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+ struct rtable *rt;
+ struct flowi4 fl;
+ __be16 src_port;
+ int port_min;
+ int port_max;
+ __be16 df;
+ int err;
+
+ if (unlikely(!OVS_CB(skb)->tun_key)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ /* Route lookup */
+ memset(&fl, 0, sizeof(fl));
+ fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+ fl.saddr = OVS_CB(skb)->tun_key->ipv4_src;
+ fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos);
+ fl.flowi4_mark = skb->mark;
+ fl.flowi4_proto = IPPROTO_UDP;
+
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto error;
+ }
+
+ df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+ htons(IP_DF) : 0;
+
+ skb->ignore_df = 1;
+
+ inet_get_local_port_range(net, &port_min, &port_max);
+ src_port = vxlan_src_port(port_min, port_max, skb);
+
+ err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
+ fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst,
+ OVS_CB(skb)->tun_key->ipv4_tos,
+ OVS_CB(skb)->tun_key->ipv4_ttl, df,
+ src_port, dst_port,
+ htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8),
+ false);
+ if (err < 0)
+ ip_rt_put(rt);
+error:
+ return err;
+}
+
+static const char *vxlan_get_name(const struct vport *vport)
+{
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ return vxlan_port->name;
+}
+
+const struct vport_ops ovs_vxlan_vport_ops = {
+ .type = OVS_VPORT_TYPE_VXLAN,
+ .create = vxlan_tnl_create,
+ .destroy = vxlan_tnl_destroy,
+ .get_name = vxlan_get_name,
+ .get_options = vxlan_get_options,
+ .send = vxlan_tnl_send,
+};
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6c066ba25dc..42c0f4a0b78 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -16,10 +16,10 @@
* 02110-1301, USA
*/
-#include <linux/dcache.h>
#include <linux/etherdevice.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
+#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -27,18 +27,30 @@
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/compat.h>
+#include <net/net_namespace.h>
+#include "datapath.h"
#include "vport.h"
#include "vport-internal_dev.h"
+static void ovs_vport_record_error(struct vport *,
+ enum vport_err_type err_type);
+
/* List of statically compiled vport implementations. Don't forget to also
* add yours to the list at the bottom of vport.h. */
static const struct vport_ops *vport_ops_list[] = {
&ovs_netdev_vport_ops,
&ovs_internal_vport_ops,
+
+#ifdef CONFIG_OPENVSWITCH_GRE
+ &ovs_gre_vport_ops,
+#endif
+#ifdef CONFIG_OPENVSWITCH_VXLAN
+ &ovs_vxlan_vport_ops,
+#endif
};
-/* Protected by RCU read lock for reading, RTNL lock for writing. */
+/* Protected by RCU read lock for reading, ovs_mutex for writing. */
static struct hlist_head *dev_table;
#define VPORT_HASH_BUCKETS 1024
@@ -67,9 +79,9 @@ void ovs_vport_exit(void)
kfree(dev_table);
}
-static struct hlist_head *hash_bucket(const char *name)
+static struct hlist_head *hash_bucket(struct net *net, const char *name)
{
- unsigned int hash = full_name_hash(name, strlen(name));
+ unsigned int hash = jhash(name, strlen(name), (unsigned long) net);
return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
}
@@ -78,16 +90,16 @@ static struct hlist_head *hash_bucket(const char *name)
*
* @name: name of port to find
*
- * Must be called with RTNL or RCU read lock.
+ * Must be called with ovs or RCU read lock.
*/
-struct vport *ovs_vport_locate(const char *name)
+struct vport *ovs_vport_locate(struct net *net, const char *name)
{
- struct hlist_head *bucket = hash_bucket(name);
+ struct hlist_head *bucket = hash_bucket(net, name);
struct vport *vport;
- struct hlist_node *node;
- hlist_for_each_entry_rcu(vport, node, bucket, hash_node)
- if (!strcmp(name, vport->ops->get_name(vport)))
+ hlist_for_each_entry_rcu(vport, bucket, hash_node)
+ if (!strcmp(name, vport->ops->get_name(vport)) &&
+ net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
return NULL;
@@ -122,10 +134,11 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
vport->dp = parms->dp;
vport->port_no = parms->port_no;
- vport->upcall_pid = parms->upcall_pid;
+ vport->upcall_portid = parms->upcall_portid;
vport->ops = ops;
+ INIT_HLIST_NODE(&vport->dp_hash_node);
- vport->percpu_stats = alloc_percpu(struct vport_percpu_stats);
+ vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!vport->percpu_stats) {
kfree(vport);
return ERR_PTR(-ENOMEM);
@@ -158,7 +171,7 @@ void ovs_vport_free(struct vport *vport)
* @parms: Information about new vport.
*
* Creates a new vport with the specified configuration (which is dependent on
- * device type). RTNL lock must be held.
+ * device type). ovs_mutex must be held.
*/
struct vport *ovs_vport_add(const struct vport_parms *parms)
{
@@ -166,18 +179,19 @@ struct vport *ovs_vport_add(const struct vport_parms *parms)
int err = 0;
int i;
- ASSERT_RTNL();
-
for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) {
if (vport_ops_list[i]->type == parms->type) {
+ struct hlist_head *bucket;
+
vport = vport_ops_list[i]->create(parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto out;
}
- hlist_add_head_rcu(&vport->hash_node,
- hash_bucket(vport->ops->get_name(vport)));
+ bucket = hash_bucket(ovs_dp_get_net(vport->dp),
+ vport->ops->get_name(vport));
+ hlist_add_head_rcu(&vport->hash_node, bucket);
return vport;
}
}
@@ -192,15 +206,13 @@ out:
* ovs_vport_set_options - modify existing vport device (for kernel callers)
*
* @vport: vport to modify.
- * @port: New configuration.
+ * @options: New configuration.
*
* Modifies an existing device with the specified configuration (which is
- * dependent on device type). RTNL lock must be held.
+ * dependent on device type). ovs_mutex must be held.
*/
int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
{
- ASSERT_RTNL();
-
if (!vport->ops->set_options)
return -EOPNOTSUPP;
return vport->ops->set_options(vport, options);
@@ -212,11 +224,11 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
* @vport: vport to delete.
*
* Detaches @vport from its datapath and destroys it. It is possible to fail
- * for reasons such as lack of memory. RTNL lock must be held.
+ * for reasons such as lack of memory. ovs_mutex must be held.
*/
void ovs_vport_del(struct vport *vport)
{
- ASSERT_RTNL();
+ ASSERT_OVSL();
hlist_del_rcu(&vport->hash_node);
@@ -231,7 +243,7 @@ void ovs_vport_del(struct vport *vport)
*
* Retrieves transmit, receive, and error stats for the given device.
*
- * Must be called with RTNL lock or rcu_read_lock.
+ * Must be called with ovs_mutex or rcu_read_lock.
*/
void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
{
@@ -258,16 +270,16 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
spin_unlock_bh(&vport->stats_lock);
for_each_possible_cpu(i) {
- const struct vport_percpu_stats *percpu_stats;
- struct vport_percpu_stats local_stats;
+ const struct pcpu_sw_netstats *percpu_stats;
+ struct pcpu_sw_netstats local_stats;
unsigned int start;
percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
do {
- start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
+ start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
+ } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
stats->rx_bytes += local_stats.rx_bytes;
stats->rx_packets += local_stats.rx_packets;
@@ -290,22 +302,24 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
* negative error code if a real error occurred. If an error occurs, @skb is
* left unmodified.
*
- * Must be called with RTNL lock or rcu_read_lock.
+ * Must be called with ovs_mutex or rcu_read_lock.
*/
int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
{
struct nlattr *nla;
+ int err;
+
+ if (!vport->ops->get_options)
+ return 0;
nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS);
if (!nla)
return -EMSGSIZE;
- if (vport->ops->get_options) {
- int err = vport->ops->get_options(vport, skb);
- if (err) {
- nla_nest_cancel(skb, nla);
- return err;
- }
+ err = vport->ops->get_options(vport, skb);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
}
nla_nest_end(skb, nla);
@@ -317,22 +331,23 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
*
* @vport: vport that received the packet
* @skb: skb that was received
+ * @tun_key: tunnel (if any) that carried packet
*
* Must be called with rcu_read_lock. The packet cannot be shared and
- * skb->data should point to the Ethernet header. The caller must have already
- * called compute_ip_summed() to initialize the checksumming fields.
+ * skb->data should point to the Ethernet header.
*/
-void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
+void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
+ struct ovs_key_ipv4_tunnel *tun_key)
{
- struct vport_percpu_stats *stats;
-
- stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
+ struct pcpu_sw_netstats *stats;
- u64_stats_update_begin(&stats->sync);
+ stats = this_cpu_ptr(vport->percpu_stats);
+ u64_stats_update_begin(&stats->syncp);
stats->rx_packets++;
stats->rx_bytes += skb->len;
- u64_stats_update_end(&stats->sync);
+ u64_stats_update_end(&stats->syncp);
+ OVS_CB(skb)->tun_key = tun_key;
ovs_dp_process_received_packet(vport, skb);
}
@@ -342,23 +357,28 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
* @vport: vport on which to send the packet
* @skb: skb to send
*
- * Sends the given packet and returns the length of data sent. Either RTNL
+ * Sends the given packet and returns the length of data sent. Either ovs
* lock or rcu_read_lock must be held.
*/
int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
int sent = vport->ops->send(vport, skb);
- if (likely(sent)) {
- struct vport_percpu_stats *stats;
+ if (likely(sent > 0)) {
+ struct pcpu_sw_netstats *stats;
- stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
+ stats = this_cpu_ptr(vport->percpu_stats);
- u64_stats_update_begin(&stats->sync);
+ u64_stats_update_begin(&stats->syncp);
stats->tx_packets++;
stats->tx_bytes += sent;
- u64_stats_update_end(&stats->sync);
- }
+ u64_stats_update_end(&stats->syncp);
+ } else if (sent < 0) {
+ ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
+ kfree_skb(skb);
+ } else
+ ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
+
return sent;
}
@@ -369,9 +389,10 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
* @err_type: one of enum vport_err_type types to indicate the error type
*
* If using the vport generic stats layer indicate that an error of the given
- * type has occured.
+ * type has occurred.
*/
-void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
+static void ovs_vport_record_error(struct vport *vport,
+ enum vport_err_type err_type)
{
spin_lock(&vport->stats_lock);
@@ -391,7 +412,22 @@ void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
case VPORT_E_TX_ERROR:
vport->err_stats.tx_errors++;
break;
- };
+ }
spin_unlock(&vport->stats_lock);
}
+
+static void free_vport_rcu(struct rcu_head *rcu)
+{
+ struct vport *vport = container_of(rcu, struct vport, rcu);
+
+ ovs_vport_free(vport);
+}
+
+void ovs_vport_deferred_free(struct vport *vport)
+{
+ if (!vport)
+ return;
+
+ call_rcu(&vport->rcu, free_vport_rcu);
+}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 19609629dab..8d721e62f38 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -19,7 +19,9 @@
#ifndef VPORT_H
#define VPORT_H 1
+#include <linux/if_tunnel.h>
#include <linux/list.h>
+#include <linux/netlink.h>
#include <linux/openvswitch.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
@@ -32,13 +34,18 @@ struct vport_parms;
/* The following definitions are for users of the vport subsytem: */
+/* The following definitions are for users of the vport subsytem: */
+struct vport_net {
+ struct vport __rcu *gre_vport;
+};
+
int ovs_vport_init(void);
void ovs_vport_exit(void);
struct vport *ovs_vport_add(const struct vport_parms *);
void ovs_vport_del(struct vport *);
-struct vport *ovs_vport_locate(const char *name);
+struct vport *ovs_vport_locate(struct net *net, const char *name);
void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
@@ -49,14 +56,6 @@ int ovs_vport_send(struct vport *, struct sk_buff *);
/* The following definitions are for implementers of vport devices: */
-struct vport_percpu_stats {
- u64 rx_bytes;
- u64 rx_packets;
- u64 tx_bytes;
- u64 tx_packets;
- struct u64_stats_sync sync;
-};
-
struct vport_err_stats {
u64 rx_dropped;
u64 rx_errors;
@@ -67,12 +66,12 @@ struct vport_err_stats {
/**
* struct vport - one port within a datapath
* @rcu: RCU callback head for deferred destruction.
- * @port_no: Index into @dp's @ports array.
* @dp: Datapath to which this port belongs.
- * @node: Element in @dp's @port_list.
- * @upcall_pid: The Netlink port to use for packets received on this port that
+ * @upcall_portid: The Netlink port to use for packets received on this port that
* miss the flow table.
+ * @port_no: Index into @dp's @ports array.
* @hash_node: Element in @dev_table hash table in vport.c.
+ * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure.
* @percpu_stats: Points to per-CPU statistics used and maintained by vport
* @stats_lock: Protects @err_stats;
@@ -80,15 +79,15 @@ struct vport_err_stats {
*/
struct vport {
struct rcu_head rcu;
- u16 port_no;
struct datapath *dp;
- struct list_head node;
- u32 upcall_pid;
+ u32 upcall_portid;
+ u16 port_no;
struct hlist_node hash_node;
+ struct hlist_node dp_hash_node;
const struct vport_ops *ops;
- struct vport_percpu_stats __percpu *percpu_stats;
+ struct pcpu_sw_netstats __percpu *percpu_stats;
spinlock_t stats_lock;
struct vport_err_stats err_stats;
@@ -112,7 +111,7 @@ struct vport_parms {
/* For ovs_vport_alloc(). */
struct datapath *dp;
u16 port_no;
- u32 upcall_pid;
+ u32 upcall_portid;
};
/**
@@ -129,25 +128,21 @@ struct vport_parms {
* existing vport to a &struct sk_buff. May be %NULL for a vport that does not
* have any configuration.
* @get_name: Get the device's name.
- * @get_config: Get the device's configuration.
- * @get_ifindex: Get the system interface index associated with the device.
- * May be null if the device does not have an ifindex.
- * @send: Send a packet on the device. Returns the length of the packet sent.
+ * @send: Send a packet on the device. Returns the length of the packet sent,
+ * zero for dropped packets or negative for error.
*/
struct vport_ops {
enum ovs_vport_type type;
- /* Called with RTNL lock. */
+ /* Called with ovs_mutex. */
struct vport *(*create)(const struct vport_parms *);
void (*destroy)(struct vport *);
int (*set_options)(struct vport *, struct nlattr *);
int (*get_options)(const struct vport *, struct sk_buff *);
- /* Called with rcu_read_lock or RTNL lock. */
+ /* Called with rcu_read_lock or ovs_mutex. */
const char *(*get_name)(const struct vport *);
- void (*get_config)(const struct vport *, void *);
- int (*get_ifindex)(const struct vport *);
int (*send)(struct vport *, struct sk_buff *);
};
@@ -162,6 +157,7 @@ enum vport_err_type {
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
+void ovs_vport_deferred_free(struct vport *vport);
#define VPORT_ALIGN 8
@@ -176,7 +172,7 @@ void ovs_vport_free(struct vport *);
*/
static inline void *vport_priv(const struct vport *vport)
{
- return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
+ return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
}
/**
@@ -189,17 +185,26 @@ static inline void *vport_priv(const struct vport *vport)
* the result of a hash table lookup. @priv must point to the start of the
* private data area.
*/
-static inline struct vport *vport_from_priv(const void *priv)
+static inline struct vport *vport_from_priv(void *priv)
{
- return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
+ return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}
-void ovs_vport_receive(struct vport *, struct sk_buff *);
-void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
+void ovs_vport_receive(struct vport *, struct sk_buff *,
+ struct ovs_key_ipv4_tunnel *);
/* List of statically compiled vport implementations. Don't forget to also
* add yours to the list at the top of vport.c. */
extern const struct vport_ops ovs_netdev_vport_ops;
extern const struct vport_ops ovs_internal_vport_ops;
+extern const struct vport_ops ovs_gre_vport_ops;
+extern const struct vport_ops ovs_vxlan_vport_ops;
+
+static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
+ const void *start, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
+}
#endif /* vport.h */