Diffstat (limited to 'net/openvswitch')
-rw-r--r--  net/openvswitch/Kconfig                 56
-rw-r--r--  net/openvswitch/Makefile                24
-rw-r--r--  net/openvswitch/actions.c              584
-rw-r--r--  net/openvswitch/datapath.c            2104
-rw-r--r--  net/openvswitch/datapath.h             202
-rw-r--r--  net/openvswitch/dp_notify.c            102
-rw-r--r--  net/openvswitch/flow.c                 615
-rw-r--r--  net/openvswitch/flow.h                 192
-rw-r--r--  net/openvswitch/flow_netlink.c        1576
-rw-r--r--  net/openvswitch/flow_netlink.h          60
-rw-r--r--  net/openvswitch/flow_table.c           647
-rw-r--r--  net/openvswitch/flow_table.h            86
-rw-r--r--  net/openvswitch/vport-gre.c            287
-rw-r--r--  net/openvswitch/vport-internal_dev.c   250
-rw-r--r--  net/openvswitch/vport-internal_dev.h    28
-rw-r--r--  net/openvswitch/vport-netdev.c         233
-rw-r--r--  net/openvswitch/vport-netdev.h          44
-rw-r--r--  net/openvswitch/vport-vxlan.c          204
-rw-r--r--  net/openvswitch/vport.c                433
-rw-r--r--  net/openvswitch/vport.h                210
20 files changed, 7937 insertions(+), 0 deletions(-)
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig new file mode 100644 index 00000000000..6ecf491ad50 --- /dev/null +++ b/net/openvswitch/Kconfig @@ -0,0 +1,56 @@ +# +# Open vSwitch +# + +config OPENVSWITCH +	tristate "Open vSwitch" +	select LIBCRC32C +	---help--- +	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized +	  environments.  In addition to supporting a variety of features +	  expected in a traditional hardware switch, it enables fine-grained +	  programmatic extension and flow-based control of the network.  This +	  control is useful in a wide variety of applications but is +	  particularly important in multi-server virtualization deployments, +	  which are often characterized by highly dynamic endpoints and the +	  need to maintain logical abstractions for multiple tenants. + +	  The Open vSwitch datapath provides an in-kernel fast path for packet +	  forwarding.  It is complemented by a userspace daemon, ovs-vswitchd, +	  which is able to accept configuration from a variety of sources and +	  translate it into packet processing rules. + +	  See http://openvswitch.org for more information and userspace +	  utilities. + +	  To compile this code as a module, choose M here: the module will be +	  called openvswitch. + +	  If unsure, say N. + +config OPENVSWITCH_GRE +	bool "Open vSwitch GRE tunneling support" +	depends on INET +	depends on OPENVSWITCH +	depends on NET_IPGRE_DEMUX && !(OPENVSWITCH=y && NET_IPGRE_DEMUX=m) +	default y +	---help--- +	  If you say Y here, then the Open vSwitch will be able create GRE +	  vport. + +	  Say N to exclude this support and reduce the binary size. + +	  If unsure, say Y. + +config OPENVSWITCH_VXLAN +	bool "Open vSwitch VXLAN tunneling support" +	depends on INET +	depends on OPENVSWITCH +	depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m) +	default y +	---help--- +	  If you say Y here, then the Open vSwitch will be able create vxlan vport. + +	  Say N to exclude this support and reduce the binary size. + +	  If unsure, say Y. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile new file mode 100644 index 00000000000..3591cb5dae9 --- /dev/null +++ b/net/openvswitch/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for Open vSwitch. +# + +obj-$(CONFIG_OPENVSWITCH) += openvswitch.o + +openvswitch-y := \ +	actions.o \ +	datapath.o \ +	dp_notify.o \ +	flow.o \ +	flow_netlink.o \ +	flow_table.o \ +	vport.o \ +	vport-internal_dev.o \ +	vport-netdev.o + +ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) +openvswitch-y += vport-vxlan.o +endif + +ifneq ($(CONFIG_OPENVSWITCH_GRE),) +openvswitch-y += vport-gre.o +endif diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c new file mode 100644 index 00000000000..e70d8b18e96 --- /dev/null +++ b/net/openvswitch/actions.c @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/openvswitch.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/in6.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/sctp/checksum.h> + +#include "datapath.h" +#include "vport.h" + +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, +			const struct nlattr *attr, int len, bool keep_skb); + +static int make_writable(struct sk_buff *skb, int write_len) +{ +	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) +		return 0; + +	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/* remove VLAN header from packet and update csum accordingly. */ +static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) +{ +	struct vlan_hdr *vhdr; +	int err; + +	err = make_writable(skb, VLAN_ETH_HLEN); +	if (unlikely(err)) +		return err; + +	if (skb->ip_summed == CHECKSUM_COMPLETE) +		skb->csum = csum_sub(skb->csum, csum_partial(skb->data +					+ (2 * ETH_ALEN), VLAN_HLEN, 0)); + +	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); +	*current_tci = vhdr->h_vlan_TCI; + +	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); +	__skb_pull(skb, VLAN_HLEN); + +	vlan_set_encap_proto(skb, vhdr); +	skb->mac_header += VLAN_HLEN; +	skb_reset_mac_len(skb); + +	return 0; +} + +static int pop_vlan(struct sk_buff *skb) +{ +	__be16 tci; +	int err; + +	if (likely(vlan_tx_tag_present(skb))) { +		skb->vlan_tci = 0; +	} else { +		if (unlikely(skb->protocol != htons(ETH_P_8021Q) || +			     skb->len < VLAN_ETH_HLEN)) +			return 0; + +		err = __pop_vlan_tci(skb, &tci); +		if (err) +			return err; +	} +	/* move next vlan tag to hw accel tag */ +	if (likely(skb->protocol != htons(ETH_P_8021Q) || +		   skb->len < VLAN_ETH_HLEN)) +		return 0; + +	err = __pop_vlan_tci(skb, &tci); +	if (unlikely(err)) +		return err; + +	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); +	return 0; +} + +static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) +{ +	if (unlikely(vlan_tx_tag_present(skb))) { +		u16 current_tag; + +		/* push down current VLAN tag */ +		current_tag = vlan_tx_tag_get(skb); + +		if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) +			return -ENOMEM; + +		if (skb->ip_summed == CHECKSUM_COMPLETE) +			skb->csum = csum_add(skb->csum, csum_partial(skb->data +					+ (2 * ETH_ALEN), VLAN_HLEN, 0)); + +	} +	__vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); +	return 0; +} + +static int set_eth_addr(struct sk_buff *skb, +			const struct ovs_key_ethernet *eth_key) +{ +	int err; +	err = make_writable(skb, ETH_HLEN); +	if (unlikely(err)) +		return err; + +	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + +	ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src); +	ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst); + +	ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + +	return 0; +} + +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, +				__be32 *addr, __be32 new_addr) +{ +	int transport_len = skb->len - skb_transport_offset(skb); + +	if (nh->protocol == IPPROTO_TCP) 
{ +		if (likely(transport_len >= sizeof(struct tcphdr))) +			inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, +						 *addr, new_addr, 1); +	} else if (nh->protocol == IPPROTO_UDP) { +		if (likely(transport_len >= sizeof(struct udphdr))) { +			struct udphdr *uh = udp_hdr(skb); + +			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { +				inet_proto_csum_replace4(&uh->check, skb, +							 *addr, new_addr, 1); +				if (!uh->check) +					uh->check = CSUM_MANGLED_0; +			} +		} +	} + +	csum_replace4(&nh->check, *addr, new_addr); +	skb_clear_hash(skb); +	*addr = new_addr; +} + +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, +				 __be32 addr[4], const __be32 new_addr[4]) +{ +	int transport_len = skb->len - skb_transport_offset(skb); + +	if (l4_proto == IPPROTO_TCP) { +		if (likely(transport_len >= sizeof(struct tcphdr))) +			inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, +						  addr, new_addr, 1); +	} else if (l4_proto == IPPROTO_UDP) { +		if (likely(transport_len >= sizeof(struct udphdr))) { +			struct udphdr *uh = udp_hdr(skb); + +			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { +				inet_proto_csum_replace16(&uh->check, skb, +							  addr, new_addr, 1); +				if (!uh->check) +					uh->check = CSUM_MANGLED_0; +			} +		} +	} +} + +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, +			  __be32 addr[4], const __be32 new_addr[4], +			  bool recalculate_csum) +{ +	if (recalculate_csum) +		update_ipv6_checksum(skb, l4_proto, addr, new_addr); + +	skb_clear_hash(skb); +	memcpy(addr, new_addr, sizeof(__be32[4])); +} + +static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) +{ +	nh->priority = tc >> 4; +	nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); +} + +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) +{ +	nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; +	nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; +	nh->flow_lbl[2] = fl & 0x000000FF; +} + +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) +{ +	csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); +	nh->ttl = new_ttl; +} + +static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) +{ +	struct iphdr *nh; +	int err; + +	err = make_writable(skb, skb_network_offset(skb) + +				 sizeof(struct iphdr)); +	if (unlikely(err)) +		return err; + +	nh = ip_hdr(skb); + +	if (ipv4_key->ipv4_src != nh->saddr) +		set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); + +	if (ipv4_key->ipv4_dst != nh->daddr) +		set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); + +	if (ipv4_key->ipv4_tos != nh->tos) +		ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos); + +	if (ipv4_key->ipv4_ttl != nh->ttl) +		set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); + +	return 0; +} + +static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) +{ +	struct ipv6hdr *nh; +	int err; +	__be32 *saddr; +	__be32 *daddr; + +	err = make_writable(skb, skb_network_offset(skb) + +			    sizeof(struct ipv6hdr)); +	if (unlikely(err)) +		return err; + +	nh = ipv6_hdr(skb); +	saddr = (__be32 *)&nh->saddr; +	daddr = (__be32 *)&nh->daddr; + +	if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) +		set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, +			      ipv6_key->ipv6_src, true); + +	if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { +		unsigned int offset = 0; +		int flags = IP6_FH_F_SKIP_RH; +		bool recalc_csum = true; + +		if (ipv6_ext_hdr(nh->nexthdr)) +			recalc_csum = ipv6_find_hdr(skb, &offset, +						    
NEXTHDR_ROUTING, NULL, +						    &flags) != NEXTHDR_ROUTING; + +		set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, +			      ipv6_key->ipv6_dst, recalc_csum); +	} + +	set_ipv6_tc(nh, ipv6_key->ipv6_tclass); +	set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); +	nh->hop_limit = ipv6_key->ipv6_hlimit; + +	return 0; +} + +/* Must follow make_writable() since that can move the skb data. */ +static void set_tp_port(struct sk_buff *skb, __be16 *port, +			 __be16 new_port, __sum16 *check) +{ +	inet_proto_csum_replace2(check, skb, *port, new_port, 0); +	*port = new_port; +	skb_clear_hash(skb); +} + +static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) +{ +	struct udphdr *uh = udp_hdr(skb); + +	if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { +		set_tp_port(skb, port, new_port, &uh->check); + +		if (!uh->check) +			uh->check = CSUM_MANGLED_0; +	} else { +		*port = new_port; +		skb_clear_hash(skb); +	} +} + +static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) +{ +	struct udphdr *uh; +	int err; + +	err = make_writable(skb, skb_transport_offset(skb) + +				 sizeof(struct udphdr)); +	if (unlikely(err)) +		return err; + +	uh = udp_hdr(skb); +	if (udp_port_key->udp_src != uh->source) +		set_udp_port(skb, &uh->source, udp_port_key->udp_src); + +	if (udp_port_key->udp_dst != uh->dest) +		set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); + +	return 0; +} + +static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) +{ +	struct tcphdr *th; +	int err; + +	err = make_writable(skb, skb_transport_offset(skb) + +				 sizeof(struct tcphdr)); +	if (unlikely(err)) +		return err; + +	th = tcp_hdr(skb); +	if (tcp_port_key->tcp_src != th->source) +		set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); + +	if (tcp_port_key->tcp_dst != th->dest) +		set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); + +	return 0; +} + +static int set_sctp(struct sk_buff *skb, +		     const struct ovs_key_sctp *sctp_port_key) +{ +	struct sctphdr *sh; +	int err; +	unsigned int sctphoff = skb_transport_offset(skb); + +	err = make_writable(skb, sctphoff + sizeof(struct sctphdr)); +	if (unlikely(err)) +		return err; + +	sh = sctp_hdr(skb); +	if (sctp_port_key->sctp_src != sh->source || +	    sctp_port_key->sctp_dst != sh->dest) { +		__le32 old_correct_csum, new_csum, old_csum; + +		old_csum = sh->checksum; +		old_correct_csum = sctp_compute_cksum(skb, sctphoff); + +		sh->source = sctp_port_key->sctp_src; +		sh->dest = sctp_port_key->sctp_dst; + +		new_csum = sctp_compute_cksum(skb, sctphoff); + +		/* Carry any checksum errors through. 
*/ +		sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + +		skb_clear_hash(skb); +	} + +	return 0; +} + +static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +{ +	struct vport *vport; + +	if (unlikely(!skb)) +		return -ENOMEM; + +	vport = ovs_vport_rcu(dp, out_port); +	if (unlikely(!vport)) { +		kfree_skb(skb); +		return -ENODEV; +	} + +	ovs_vport_send(vport, skb); +	return 0; +} + +static int output_userspace(struct datapath *dp, struct sk_buff *skb, +			    const struct nlattr *attr) +{ +	struct dp_upcall_info upcall; +	const struct nlattr *a; +	int rem; + +	BUG_ON(!OVS_CB(skb)->pkt_key); + +	upcall.cmd = OVS_PACKET_CMD_ACTION; +	upcall.key = OVS_CB(skb)->pkt_key; +	upcall.userdata = NULL; +	upcall.portid = 0; + +	for (a = nla_data(attr), rem = nla_len(attr); rem > 0; +		 a = nla_next(a, &rem)) { +		switch (nla_type(a)) { +		case OVS_USERSPACE_ATTR_USERDATA: +			upcall.userdata = a; +			break; + +		case OVS_USERSPACE_ATTR_PID: +			upcall.portid = nla_get_u32(a); +			break; +		} +	} + +	return ovs_dp_upcall(dp, skb, &upcall); +} + +static int sample(struct datapath *dp, struct sk_buff *skb, +		  const struct nlattr *attr) +{ +	const struct nlattr *acts_list = NULL; +	const struct nlattr *a; +	int rem; + +	for (a = nla_data(attr), rem = nla_len(attr); rem > 0; +		 a = nla_next(a, &rem)) { +		switch (nla_type(a)) { +		case OVS_SAMPLE_ATTR_PROBABILITY: +			if (prandom_u32() >= nla_get_u32(a)) +				return 0; +			break; + +		case OVS_SAMPLE_ATTR_ACTIONS: +			acts_list = a; +			break; +		} +	} + +	return do_execute_actions(dp, skb, nla_data(acts_list), +						 nla_len(acts_list), true); +} + +static int execute_set_action(struct sk_buff *skb, +				 const struct nlattr *nested_attr) +{ +	int err = 0; + +	switch (nla_type(nested_attr)) { +	case OVS_KEY_ATTR_PRIORITY: +		skb->priority = nla_get_u32(nested_attr); +		break; + +	case OVS_KEY_ATTR_SKB_MARK: +		skb->mark = nla_get_u32(nested_attr); +		break; + +	case OVS_KEY_ATTR_IPV4_TUNNEL: +		OVS_CB(skb)->tun_key = nla_data(nested_attr); +		break; + +	case OVS_KEY_ATTR_ETHERNET: +		err = set_eth_addr(skb, nla_data(nested_attr)); +		break; + +	case OVS_KEY_ATTR_IPV4: +		err = set_ipv4(skb, nla_data(nested_attr)); +		break; + +	case OVS_KEY_ATTR_IPV6: +		err = set_ipv6(skb, nla_data(nested_attr)); +		break; + +	case OVS_KEY_ATTR_TCP: +		err = set_tcp(skb, nla_data(nested_attr)); +		break; + +	case OVS_KEY_ATTR_UDP: +		err = set_udp(skb, nla_data(nested_attr)); +		break; + +	case OVS_KEY_ATTR_SCTP: +		err = set_sctp(skb, nla_data(nested_attr)); +		break; +	} + +	return err; +} + +/* Execute a list of actions against 'skb'. */ +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, +			const struct nlattr *attr, int len, bool keep_skb) +{ +	/* Every output action needs a separate clone of 'skb', but the common +	 * case is just a single output action, so that doing a clone and +	 * then freeing the original skbuff is wasteful.  So the following code +	 * is slightly obscure just to avoid that. 
*/ +	int prev_port = -1; +	const struct nlattr *a; +	int rem; + +	for (a = attr, rem = len; rem > 0; +	     a = nla_next(a, &rem)) { +		int err = 0; + +		if (prev_port != -1) { +			do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); +			prev_port = -1; +		} + +		switch (nla_type(a)) { +		case OVS_ACTION_ATTR_OUTPUT: +			prev_port = nla_get_u32(a); +			break; + +		case OVS_ACTION_ATTR_USERSPACE: +			output_userspace(dp, skb, a); +			break; + +		case OVS_ACTION_ATTR_PUSH_VLAN: +			err = push_vlan(skb, nla_data(a)); +			if (unlikely(err)) /* skb already freed. */ +				return err; +			break; + +		case OVS_ACTION_ATTR_POP_VLAN: +			err = pop_vlan(skb); +			break; + +		case OVS_ACTION_ATTR_SET: +			err = execute_set_action(skb, nla_data(a)); +			break; + +		case OVS_ACTION_ATTR_SAMPLE: +			err = sample(dp, skb, a); +			if (unlikely(err)) /* skb already freed. */ +				return err; +			break; +		} + +		if (unlikely(err)) { +			kfree_skb(skb); +			return err; +		} +	} + +	if (prev_port != -1) { +		if (keep_skb) +			skb = skb_clone(skb, GFP_ATOMIC); + +		do_output(dp, skb, prev_port); +	} else if (!keep_skb) +		consume_skb(skb); + +	return 0; +} + +/* Execute a list of actions against 'skb'. */ +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) +{ +	struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + +	OVS_CB(skb)->tun_key = NULL; +	return do_execute_actions(dp, skb, acts->actions, +					 acts->actions_len, false); +} diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c new file mode 100644 index 00000000000..9db4bf6740d --- /dev/null +++ b/net/openvswitch/datapath.c @@ -0,0 +1,2104 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/time.h> +#include <linux/etherdevice.h> +#include <linux/genetlink.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/ethtool.h> +#include <linux/wait.h> +#include <asm/div64.h> +#include <linux/highmem.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv4.h> +#include <linux/inetdevice.h> +#include <linux/list.h> +#include <linux/openvswitch.h> +#include <linux/rculist.h> +#include <linux/dmi.h> +#include <linux/genetlink.h> +#include <net/genetlink.h> +#include <net/genetlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include "datapath.h" +#include "flow.h" +#include "flow_table.h" +#include "flow_netlink.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +int ovs_net_id __read_mostly; + +static struct genl_family dp_packet_genl_family; +static struct genl_family dp_flow_genl_family; +static struct genl_family dp_datapath_genl_family; + +static struct genl_multicast_group ovs_dp_flow_multicast_group = { +	.name = OVS_FLOW_MCGROUP +}; + +static struct genl_multicast_group ovs_dp_datapath_multicast_group = { +	.name = OVS_DATAPATH_MCGROUP +}; + +struct genl_multicast_group ovs_dp_vport_multicast_group = { +	.name = OVS_VPORT_MCGROUP +}; + +/* Check if need to build a reply message. + * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ +static bool ovs_must_notify(struct genl_info *info, +			    const struct genl_multicast_group *grp) +{ +	return info->nlhdr->nlmsg_flags & NLM_F_ECHO || +		netlink_has_listeners(genl_info_net(info)->genl_sock, 0); +} + +static void ovs_notify(struct genl_family *family, +		       struct sk_buff *skb, struct genl_info *info) +{ +	genl_notify(family, skb, genl_info_net(info), info->snd_portid, +		    0, info->nlhdr, GFP_KERNEL); +} + +/** + * DOC: Locking: + * + * All writes e.g. Writes to device state (add/remove datapath, port, set + * operations on vports, etc.), Writes to other state (flow table + * modifications, set miscellaneous datapath parameters, etc.) are protected + * by ovs_lock. + * + * Reads are protected by RCU. + * + * There are a few special cases (mostly stats) that have their own + * synchronization but they nest under all of above and don't interact with + * each other. + * + * The RTNL lock nests inside ovs_mutex. 
+ */ + +static DEFINE_MUTEX(ovs_mutex); + +void ovs_lock(void) +{ +	mutex_lock(&ovs_mutex); +} + +void ovs_unlock(void) +{ +	mutex_unlock(&ovs_mutex); +} + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void) +{ +	if (debug_locks) +		return lockdep_is_held(&ovs_mutex); +	else +		return 1; +} +#endif + +static struct vport *new_vport(const struct vport_parms *); +static int queue_gso_packets(struct datapath *dp, struct sk_buff *, +			     const struct dp_upcall_info *); +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, +				  const struct dp_upcall_info *); + +/* Must be called with rcu_read_lock or ovs_mutex. */ +static struct datapath *get_dp(struct net *net, int dp_ifindex) +{ +	struct datapath *dp = NULL; +	struct net_device *dev; + +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, dp_ifindex); +	if (dev) { +		struct vport *vport = ovs_internal_dev_get_vport(dev); +		if (vport) +			dp = vport->dp; +	} +	rcu_read_unlock(); + +	return dp; +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +static const char *ovs_dp_name(const struct datapath *dp) +{ +	struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); +	return vport->ops->get_name(vport); +} + +static int get_dpifindex(struct datapath *dp) +{ +	struct vport *local; +	int ifindex; + +	rcu_read_lock(); + +	local = ovs_vport_rcu(dp, OVSP_LOCAL); +	if (local) +		ifindex = netdev_vport_priv(local)->dev->ifindex; +	else +		ifindex = 0; + +	rcu_read_unlock(); + +	return ifindex; +} + +static void destroy_dp_rcu(struct rcu_head *rcu) +{ +	struct datapath *dp = container_of(rcu, struct datapath, rcu); + +	free_percpu(dp->stats_percpu); +	release_net(ovs_dp_get_net(dp)); +	kfree(dp->ports); +	kfree(dp); +} + +static struct hlist_head *vport_hash_bucket(const struct datapath *dp, +					    u16 port_no) +{ +	return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; +} + +/* Called with ovs_mutex or RCU read lock. */ +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) +{ +	struct vport *vport; +	struct hlist_head *head; + +	head = vport_hash_bucket(dp, port_no); +	hlist_for_each_entry_rcu(vport, head, dp_hash_node) { +		if (vport->port_no == port_no) +			return vport; +	} +	return NULL; +} + +/* Called with ovs_mutex. */ +static struct vport *new_vport(const struct vport_parms *parms) +{ +	struct vport *vport; + +	vport = ovs_vport_add(parms); +	if (!IS_ERR(vport)) { +		struct datapath *dp = parms->dp; +		struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); + +		hlist_add_head_rcu(&vport->dp_hash_node, head); +	} +	return vport; +} + +void ovs_dp_detach_port(struct vport *p) +{ +	ASSERT_OVSL(); + +	/* First drop references to device. */ +	hlist_del_rcu(&p->dp_hash_node); + +	/* Then destroy it. */ +	ovs_vport_del(p); +} + +/* Must be called with rcu_read_lock. */ +void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +{ +	struct datapath *dp = p->dp; +	struct sw_flow *flow; +	struct dp_stats_percpu *stats; +	struct sw_flow_key key; +	u64 *stats_counter; +	u32 n_mask_hit; +	int error; + +	stats = this_cpu_ptr(dp->stats_percpu); + +	/* Extract flow from 'skb' into 'key'. */ +	error = ovs_flow_extract(skb, p->port_no, &key); +	if (unlikely(error)) { +		kfree_skb(skb); +		return; +	} + +	/* Look up flow. 
*/ +	flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit); +	if (unlikely(!flow)) { +		struct dp_upcall_info upcall; + +		upcall.cmd = OVS_PACKET_CMD_MISS; +		upcall.key = &key; +		upcall.userdata = NULL; +		upcall.portid = p->upcall_portid; +		ovs_dp_upcall(dp, skb, &upcall); +		consume_skb(skb); +		stats_counter = &stats->n_missed; +		goto out; +	} + +	OVS_CB(skb)->flow = flow; +	OVS_CB(skb)->pkt_key = &key; + +	ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); +	ovs_execute_actions(dp, skb); +	stats_counter = &stats->n_hit; + +out: +	/* Update datapath statistics. */ +	u64_stats_update_begin(&stats->syncp); +	(*stats_counter)++; +	stats->n_mask_hit += n_mask_hit; +	u64_stats_update_end(&stats->syncp); +} + +int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, +		  const struct dp_upcall_info *upcall_info) +{ +	struct dp_stats_percpu *stats; +	int err; + +	if (upcall_info->portid == 0) { +		err = -ENOTCONN; +		goto err; +	} + +	if (!skb_is_gso(skb)) +		err = queue_userspace_packet(dp, skb, upcall_info); +	else +		err = queue_gso_packets(dp, skb, upcall_info); +	if (err) +		goto err; + +	return 0; + +err: +	stats = this_cpu_ptr(dp->stats_percpu); + +	u64_stats_update_begin(&stats->syncp); +	stats->n_lost++; +	u64_stats_update_end(&stats->syncp); + +	return err; +} + +static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, +			     const struct dp_upcall_info *upcall_info) +{ +	unsigned short gso_type = skb_shinfo(skb)->gso_type; +	struct dp_upcall_info later_info; +	struct sw_flow_key later_key; +	struct sk_buff *segs, *nskb; +	int err; + +	segs = __skb_gso_segment(skb, NETIF_F_SG, false); +	if (IS_ERR(segs)) +		return PTR_ERR(segs); + +	/* Queue all of the segments. */ +	skb = segs; +	do { +		err = queue_userspace_packet(dp, skb, upcall_info); +		if (err) +			break; + +		if (skb == segs && gso_type & SKB_GSO_UDP) { +			/* The initial flow key extracted by ovs_flow_extract() +			 * in this case is for a first fragment, so we need to +			 * properly mark later fragments. +			 */ +			later_key = *upcall_info->key; +			later_key.ip.frag = OVS_FRAG_TYPE_LATER; + +			later_info = *upcall_info; +			later_info.key = &later_key; +			upcall_info = &later_info; +		} +	} while ((skb = skb->next)); + +	/* Free all of the segments. 
*/ +	skb = segs; +	do { +		nskb = skb->next; +		if (err) +			kfree_skb(skb); +		else +			consume_skb(skb); +	} while ((skb = nskb)); +	return err; +} + +static size_t key_attr_size(void) +{ +	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */ +		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */ +		  + nla_total_size(8)   /* OVS_TUNNEL_KEY_ATTR_ID */ +		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ +		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ +		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TOS */ +		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */ +		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ +		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */ +		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */ +		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */ +		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */ +		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */ +		+ nla_total_size(4)   /* OVS_KEY_ATTR_8021Q */ +		+ nla_total_size(0)   /* OVS_KEY_ATTR_ENCAP */ +		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */ +		+ nla_total_size(40)  /* OVS_KEY_ATTR_IPV6 */ +		+ nla_total_size(2)   /* OVS_KEY_ATTR_ICMPV6 */ +		+ nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + +static size_t upcall_msg_size(const struct nlattr *userdata, +			      unsigned int hdrlen) +{ +	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) +		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ +		+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + +	/* OVS_PACKET_ATTR_USERDATA */ +	if (userdata) +		size += NLA_ALIGN(userdata->nla_len); + +	return size; +} + +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, +				  const struct dp_upcall_info *upcall_info) +{ +	struct ovs_header *upcall; +	struct sk_buff *nskb = NULL; +	struct sk_buff *user_skb; /* to be queued to userspace */ +	struct nlattr *nla; +	struct genl_info info = { +		.dst_sk = ovs_dp_get_net(dp)->genl_sock, +		.snd_portid = upcall_info->portid, +	}; +	size_t len; +	unsigned int hlen; +	int err, dp_ifindex; + +	dp_ifindex = get_dpifindex(dp); +	if (!dp_ifindex) +		return -ENODEV; + +	if (vlan_tx_tag_present(skb)) { +		nskb = skb_clone(skb, GFP_ATOMIC); +		if (!nskb) +			return -ENOMEM; + +		nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); +		if (!nskb) +			return -ENOMEM; + +		nskb->vlan_tci = 0; +		skb = nskb; +	} + +	if (nla_attr_size(skb->len) > USHRT_MAX) { +		err = -EFBIG; +		goto out; +	} + +	/* Complete checksum if needed */ +	if (skb->ip_summed == CHECKSUM_PARTIAL && +	    (err = skb_checksum_help(skb))) +		goto out; + +	/* Older versions of OVS user space enforce alignment of the last +	 * Netlink attribute to NLA_ALIGNTO which would require extensive +	 * padding logic. Only perform zerocopy if padding is not required. 
+	 */ +	if (dp->user_features & OVS_DP_F_UNALIGNED) +		hlen = skb_zerocopy_headlen(skb); +	else +		hlen = skb->len; + +	len = upcall_msg_size(upcall_info->userdata, hlen); +	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); +	if (!user_skb) { +		err = -ENOMEM; +		goto out; +	} + +	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, +			     0, upcall_info->cmd); +	upcall->dp_ifindex = dp_ifindex; + +	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); +	ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); +	nla_nest_end(user_skb, nla); + +	if (upcall_info->userdata) +		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, +			  nla_len(upcall_info->userdata), +			  nla_data(upcall_info->userdata)); + +	/* Only reserve room for attribute header, packet data is added +	 * in skb_zerocopy() */ +	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { +		err = -ENOBUFS; +		goto out; +	} +	nla->nla_len = nla_attr_size(skb->len); + +	err = skb_zerocopy(user_skb, skb, skb->len, hlen); +	if (err) +		goto out; + +	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ +	if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { +		size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; + +		if (plen > 0) +			memset(skb_put(user_skb, plen), 0, plen); +	} + +	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; + +	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); +out: +	if (err) +		skb_tx_error(skb); +	kfree_skb(nskb); +	return err; +} + +static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) +{ +	struct ovs_header *ovs_header = info->userhdr; +	struct nlattr **a = info->attrs; +	struct sw_flow_actions *acts; +	struct sk_buff *packet; +	struct sw_flow *flow; +	struct datapath *dp; +	struct ethhdr *eth; +	int len; +	int err; + +	err = -EINVAL; +	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || +	    !a[OVS_PACKET_ATTR_ACTIONS]) +		goto err; + +	len = nla_len(a[OVS_PACKET_ATTR_PACKET]); +	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); +	err = -ENOMEM; +	if (!packet) +		goto err; +	skb_reserve(packet, NET_IP_ALIGN); + +	nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); + +	skb_reset_mac_header(packet); +	eth = eth_hdr(packet); + +	/* Normally, setting the skb 'protocol' field would be handled by a +	 * call to eth_type_trans(), but it assumes there's a sending +	 * device, which we may not have. */ +	if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) +		packet->protocol = eth->h_proto; +	else +		packet->protocol = htons(ETH_P_802_2); + +	/* Build an sw_flow for sending this packet. 
*/ +	flow = ovs_flow_alloc(); +	err = PTR_ERR(flow); +	if (IS_ERR(flow)) +		goto err_kfree_skb; + +	err = ovs_flow_extract(packet, -1, &flow->key); +	if (err) +		goto err_flow_free; + +	err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]); +	if (err) +		goto err_flow_free; +	acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); +	err = PTR_ERR(acts); +	if (IS_ERR(acts)) +		goto err_flow_free; + +	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], +				   &flow->key, 0, &acts); +	rcu_assign_pointer(flow->sf_acts, acts); +	if (err) +		goto err_flow_free; + +	OVS_CB(packet)->flow = flow; +	OVS_CB(packet)->pkt_key = &flow->key; +	packet->priority = flow->key.phy.priority; +	packet->mark = flow->key.phy.skb_mark; + +	rcu_read_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	err = -ENODEV; +	if (!dp) +		goto err_unlock; + +	local_bh_disable(); +	err = ovs_execute_actions(dp, packet); +	local_bh_enable(); +	rcu_read_unlock(); + +	ovs_flow_free(flow, false); +	return err; + +err_unlock: +	rcu_read_unlock(); +err_flow_free: +	ovs_flow_free(flow, false); +err_kfree_skb: +	kfree_skb(packet); +err: +	return err; +} + +static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { +	[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, +	[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, +	[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, +}; + +static const struct genl_ops dp_packet_genl_ops[] = { +	{ .cmd = OVS_PACKET_CMD_EXECUTE, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = packet_policy, +	  .doit = ovs_packet_cmd_execute +	} +}; + +static struct genl_family dp_packet_genl_family = { +	.id = GENL_ID_GENERATE, +	.hdrsize = sizeof(struct ovs_header), +	.name = OVS_PACKET_FAMILY, +	.version = OVS_PACKET_VERSION, +	.maxattr = OVS_PACKET_ATTR_MAX, +	.netnsok = true, +	.parallel_ops = true, +	.ops = dp_packet_genl_ops, +	.n_ops = ARRAY_SIZE(dp_packet_genl_ops), +}; + +static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, +			 struct ovs_dp_megaflow_stats *mega_stats) +{ +	int i; + +	memset(mega_stats, 0, sizeof(*mega_stats)); + +	stats->n_flows = ovs_flow_tbl_count(&dp->table); +	mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); + +	stats->n_hit = stats->n_missed = stats->n_lost = 0; + +	for_each_possible_cpu(i) { +		const struct dp_stats_percpu *percpu_stats; +		struct dp_stats_percpu local_stats; +		unsigned int start; + +		percpu_stats = per_cpu_ptr(dp->stats_percpu, i); + +		do { +			start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); +			local_stats = *percpu_stats; +		} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + +		stats->n_hit += local_stats.n_hit; +		stats->n_missed += local_stats.n_missed; +		stats->n_lost += local_stats.n_lost; +		mega_stats->n_mask_hit += local_stats.n_mask_hit; +	} +} + +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) +{ +	return NLMSG_ALIGN(sizeof(struct ovs_header)) +		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ +		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ +		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ +		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ +		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */ +		+ nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ +} + +/* Called with ovs_mutex or RCU read lock. 
*/ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, +				  struct sk_buff *skb, u32 portid, +				  u32 seq, u32 flags, u8 cmd) +{ +	const int skb_orig_len = skb->len; +	struct nlattr *start; +	struct ovs_flow_stats stats; +	__be16 tcp_flags; +	unsigned long used; +	struct ovs_header *ovs_header; +	struct nlattr *nla; +	int err; + +	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); +	if (!ovs_header) +		return -EMSGSIZE; + +	ovs_header->dp_ifindex = dp_ifindex; + +	/* Fill flow key. */ +	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); +	if (!nla) +		goto nla_put_failure; + +	err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); +	if (err) +		goto error; +	nla_nest_end(skb, nla); + +	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); +	if (!nla) +		goto nla_put_failure; + +	err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); +	if (err) +		goto error; + +	nla_nest_end(skb, nla); + +	ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); + +	if (used && +	    nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) +		goto nla_put_failure; + +	if (stats.n_packets && +	    nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) +		goto nla_put_failure; + +	if ((u8)ntohs(tcp_flags) && +	     nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) +		goto nla_put_failure; + +	/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if +	 * this is the first flow to be dumped into 'skb'.  This is unusual for +	 * Netlink but individual action lists can be longer than +	 * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. +	 * The userspace caller can always fetch the actions separately if it +	 * really wants them.  (Most userspace callers in fact don't care.) +	 * +	 * This can only fail for dump operations because the skb is always +	 * properly sized for single flows. +	 */ +	start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); +	if (start) { +		const struct sw_flow_actions *sf_acts; + +		sf_acts = rcu_dereference_ovsl(flow->sf_acts); +		err = ovs_nla_put_actions(sf_acts->actions, +					  sf_acts->actions_len, skb); + +		if (!err) +			nla_nest_end(skb, start); +		else { +			if (skb_orig_len) +				goto error; + +			nla_nest_cancel(skb, start); +		} +	} else if (skb_orig_len) +		goto nla_put_failure; + +	return genlmsg_end(skb, ovs_header); + +nla_put_failure: +	err = -EMSGSIZE; +error: +	genlmsg_cancel(skb, ovs_header); +	return err; +} + +/* May not be called with RCU read lock. */ +static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, +					       struct genl_info *info, +					       bool always) +{ +	struct sk_buff *skb; + +	if (!always && !ovs_must_notify(info, &ovs_dp_flow_multicast_group)) +		return NULL; + +	skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL); +	if (!skb) +		return ERR_PTR(-ENOMEM); + +	return skb; +} + +/* Called with ovs_mutex. 
*/ +static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, +					       int dp_ifindex, +					       struct genl_info *info, u8 cmd, +					       bool always) +{ +	struct sk_buff *skb; +	int retval; + +	skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info, +				      always); +	if (!skb || IS_ERR(skb)) +		return skb; + +	retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, +					info->snd_portid, info->snd_seq, 0, +					cmd); +	BUG_ON(retval < 0); +	return skb; +} + +static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct ovs_header *ovs_header = info->userhdr; +	struct sw_flow *flow, *new_flow; +	struct sw_flow_mask mask; +	struct sk_buff *reply; +	struct datapath *dp; +	struct sw_flow_actions *acts; +	struct sw_flow_match match; +	int error; + +	/* Must have key and actions. */ +	error = -EINVAL; +	if (!a[OVS_FLOW_ATTR_KEY]) +		goto error; +	if (!a[OVS_FLOW_ATTR_ACTIONS]) +		goto error; + +	/* Most of the time we need to allocate a new flow, do it before +	 * locking. +	 */ +	new_flow = ovs_flow_alloc(); +	if (IS_ERR(new_flow)) { +		error = PTR_ERR(new_flow); +		goto error; +	} + +	/* Extract key. */ +	ovs_match_init(&match, &new_flow->unmasked_key, &mask); +	error = ovs_nla_get_match(&match, +				  a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); +	if (error) +		goto err_kfree_flow; + +	ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); + +	/* Validate actions. */ +	acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); +	error = PTR_ERR(acts); +	if (IS_ERR(acts)) +		goto err_kfree_flow; + +	error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, +				     0, &acts); +	if (error) { +		OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); +		goto err_kfree_acts; +	} + +	reply = ovs_flow_cmd_alloc_info(acts, info, false); +	if (IS_ERR(reply)) { +		error = PTR_ERR(reply); +		goto err_kfree_acts; +	} + +	ovs_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	if (unlikely(!dp)) { +		error = -ENODEV; +		goto err_unlock_ovs; +	} +	/* Check if this is a duplicate flow */ +	flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key); +	if (likely(!flow)) { +		rcu_assign_pointer(new_flow->sf_acts, acts); + +		/* Put flow in bucket. */ +		error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); +		if (unlikely(error)) { +			acts = NULL; +			goto err_unlock_ovs; +		} + +		if (unlikely(reply)) { +			error = ovs_flow_cmd_fill_info(new_flow, +						       ovs_header->dp_ifindex, +						       reply, info->snd_portid, +						       info->snd_seq, 0, +						       OVS_FLOW_CMD_NEW); +			BUG_ON(error < 0); +		} +		ovs_unlock(); +	} else { +		struct sw_flow_actions *old_acts; + +		/* Bail out if we're not allowed to modify an existing flow. +		 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL +		 * because Generic Netlink treats the latter as a dump +		 * request.  We also accept NLM_F_EXCL in case that bug ever +		 * gets fixed. +		 */ +		if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE +							 | NLM_F_EXCL))) { +			error = -EEXIST; +			goto err_unlock_ovs; +		} +		/* The unmasked key has to be the same for flow updates. */ +		if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { +			flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); +			if (!flow) { +				error = -ENOENT; +				goto err_unlock_ovs; +			} +		} +		/* Update actions. 
*/ +		old_acts = ovsl_dereference(flow->sf_acts); +		rcu_assign_pointer(flow->sf_acts, acts); + +		if (unlikely(reply)) { +			error = ovs_flow_cmd_fill_info(flow, +						       ovs_header->dp_ifindex, +						       reply, info->snd_portid, +						       info->snd_seq, 0, +						       OVS_FLOW_CMD_NEW); +			BUG_ON(error < 0); +		} +		ovs_unlock(); + +		ovs_nla_free_flow_actions(old_acts); +		ovs_flow_free(new_flow, false); +	} + +	if (reply) +		ovs_notify(&dp_flow_genl_family, reply, info); +	return 0; + +err_unlock_ovs: +	ovs_unlock(); +	kfree_skb(reply); +err_kfree_acts: +	kfree(acts); +err_kfree_flow: +	ovs_flow_free(new_flow, false); +error: +	return error; +} + +static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct ovs_header *ovs_header = info->userhdr; +	struct sw_flow_key key, masked_key; +	struct sw_flow *flow; +	struct sw_flow_mask mask; +	struct sk_buff *reply = NULL; +	struct datapath *dp; +	struct sw_flow_actions *old_acts = NULL, *acts = NULL; +	struct sw_flow_match match; +	int error; + +	/* Extract key. */ +	error = -EINVAL; +	if (!a[OVS_FLOW_ATTR_KEY]) +		goto error; + +	ovs_match_init(&match, &key, &mask); +	error = ovs_nla_get_match(&match, +				  a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); +	if (error) +		goto error; + +	/* Validate actions. */ +	if (a[OVS_FLOW_ATTR_ACTIONS]) { +		acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); +		error = PTR_ERR(acts); +		if (IS_ERR(acts)) +			goto error; + +		ovs_flow_mask_key(&masked_key, &key, &mask); +		error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], +					     &masked_key, 0, &acts); +		if (error) { +			OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); +			goto err_kfree_acts; +		} +	} + +	/* Can allocate before locking if have acts. */ +	if (acts) { +		reply = ovs_flow_cmd_alloc_info(acts, info, false); +		if (IS_ERR(reply)) { +			error = PTR_ERR(reply); +			goto err_kfree_acts; +		} +	} + +	ovs_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	if (unlikely(!dp)) { +		error = -ENODEV; +		goto err_unlock_ovs; +	} +	/* Check that the flow exists. */ +	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); +	if (unlikely(!flow)) { +		error = -ENOENT; +		goto err_unlock_ovs; +	} + +	/* Update actions, if present. */ +	if (likely(acts)) { +		old_acts = ovsl_dereference(flow->sf_acts); +		rcu_assign_pointer(flow->sf_acts, acts); + +		if (unlikely(reply)) { +			error = ovs_flow_cmd_fill_info(flow, +						       ovs_header->dp_ifindex, +						       reply, info->snd_portid, +						       info->snd_seq, 0, +						       OVS_FLOW_CMD_NEW); +			BUG_ON(error < 0); +		} +	} else { +		/* Could not alloc without acts before locking. */ +		reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, +						info, OVS_FLOW_CMD_NEW, false); +		if (unlikely(IS_ERR(reply))) { +			error = PTR_ERR(reply); +			goto err_unlock_ovs; +		} +	} + +	/* Clear stats. 
*/ +	if (a[OVS_FLOW_ATTR_CLEAR]) +		ovs_flow_stats_clear(flow); +	ovs_unlock(); + +	if (reply) +		ovs_notify(&dp_flow_genl_family, reply, info); +	if (old_acts) +		ovs_nla_free_flow_actions(old_acts); + +	return 0; + +err_unlock_ovs: +	ovs_unlock(); +	kfree_skb(reply); +err_kfree_acts: +	kfree(acts); +error: +	return error; +} + +static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct ovs_header *ovs_header = info->userhdr; +	struct sw_flow_key key; +	struct sk_buff *reply; +	struct sw_flow *flow; +	struct datapath *dp; +	struct sw_flow_match match; +	int err; + +	if (!a[OVS_FLOW_ATTR_KEY]) { +		OVS_NLERR("Flow get message rejected, Key attribute missing.\n"); +		return -EINVAL; +	} + +	ovs_match_init(&match, &key, NULL); +	err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); +	if (err) +		return err; + +	ovs_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	if (!dp) { +		err = -ENODEV; +		goto unlock; +	} + +	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); +	if (!flow) { +		err = -ENOENT; +		goto unlock; +	} + +	reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, +					OVS_FLOW_CMD_NEW, true); +	if (IS_ERR(reply)) { +		err = PTR_ERR(reply); +		goto unlock; +	} + +	ovs_unlock(); +	return genlmsg_reply(reply, info); +unlock: +	ovs_unlock(); +	return err; +} + +static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct ovs_header *ovs_header = info->userhdr; +	struct sw_flow_key key; +	struct sk_buff *reply; +	struct sw_flow *flow; +	struct datapath *dp; +	struct sw_flow_match match; +	int err; + +	if (likely(a[OVS_FLOW_ATTR_KEY])) { +		ovs_match_init(&match, &key, NULL); +		err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); +		if (unlikely(err)) +			return err; +	} + +	ovs_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	if (unlikely(!dp)) { +		err = -ENODEV; +		goto unlock; +	} + +	if (unlikely(!a[OVS_FLOW_ATTR_KEY])) { +		err = ovs_flow_tbl_flush(&dp->table); +		goto unlock; +	} + +	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); +	if (unlikely(!flow)) { +		err = -ENOENT; +		goto unlock; +	} + +	ovs_flow_tbl_remove(&dp->table, flow); +	ovs_unlock(); + +	reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, +					info, false); +	if (likely(reply)) { +		if (likely(!IS_ERR(reply))) { +			rcu_read_lock();	/*To keep RCU checker happy. 
*/ +			err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, +						     reply, info->snd_portid, +						     info->snd_seq, 0, +						     OVS_FLOW_CMD_DEL); +			rcu_read_unlock(); +			BUG_ON(err < 0); + +			ovs_notify(&dp_flow_genl_family, reply, info); +		} else { +			netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); +		} +	} + +	ovs_flow_free(flow, true); +	return 0; +unlock: +	ovs_unlock(); +	return err; +} + +static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); +	struct table_instance *ti; +	struct datapath *dp; + +	rcu_read_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	if (!dp) { +		rcu_read_unlock(); +		return -ENODEV; +	} + +	ti = rcu_dereference(dp->table.ti); +	for (;;) { +		struct sw_flow *flow; +		u32 bucket, obj; + +		bucket = cb->args[0]; +		obj = cb->args[1]; +		flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); +		if (!flow) +			break; + +		if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, +					   NETLINK_CB(cb->skb).portid, +					   cb->nlh->nlmsg_seq, NLM_F_MULTI, +					   OVS_FLOW_CMD_NEW) < 0) +			break; + +		cb->args[0] = bucket; +		cb->args[1] = obj; +	} +	rcu_read_unlock(); +	return skb->len; +} + +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { +	[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, +	[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, +	[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, +}; + +static struct genl_ops dp_flow_genl_ops[] = { +	{ .cmd = OVS_FLOW_CMD_NEW, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = flow_policy, +	  .doit = ovs_flow_cmd_new +	}, +	{ .cmd = OVS_FLOW_CMD_DEL, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = flow_policy, +	  .doit = ovs_flow_cmd_del +	}, +	{ .cmd = OVS_FLOW_CMD_GET, +	  .flags = 0,		    /* OK for unprivileged users. */ +	  .policy = flow_policy, +	  .doit = ovs_flow_cmd_get, +	  .dumpit = ovs_flow_cmd_dump +	}, +	{ .cmd = OVS_FLOW_CMD_SET, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = flow_policy, +	  .doit = ovs_flow_cmd_set, +	}, +}; + +static struct genl_family dp_flow_genl_family = { +	.id = GENL_ID_GENERATE, +	.hdrsize = sizeof(struct ovs_header), +	.name = OVS_FLOW_FAMILY, +	.version = OVS_FLOW_VERSION, +	.maxattr = OVS_FLOW_ATTR_MAX, +	.netnsok = true, +	.parallel_ops = true, +	.ops = dp_flow_genl_ops, +	.n_ops = ARRAY_SIZE(dp_flow_genl_ops), +	.mcgrps = &ovs_dp_flow_multicast_group, +	.n_mcgrps = 1, +}; + +static size_t ovs_dp_cmd_msg_size(void) +{ +	size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); + +	msgsize += nla_total_size(IFNAMSIZ); +	msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); +	msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); +	msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + +	return msgsize; +} + +/* Called with ovs_mutex or RCU read lock. 
*/ +static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, +				u32 portid, u32 seq, u32 flags, u8 cmd) +{ +	struct ovs_header *ovs_header; +	struct ovs_dp_stats dp_stats; +	struct ovs_dp_megaflow_stats dp_megaflow_stats; +	int err; + +	ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, +				   flags, cmd); +	if (!ovs_header) +		goto error; + +	ovs_header->dp_ifindex = get_dpifindex(dp); + +	err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); +	if (err) +		goto nla_put_failure; + +	get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); +	if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), +			&dp_stats)) +		goto nla_put_failure; + +	if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, +			sizeof(struct ovs_dp_megaflow_stats), +			&dp_megaflow_stats)) +		goto nla_put_failure; + +	if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) +		goto nla_put_failure; + +	return genlmsg_end(skb, ovs_header); + +nla_put_failure: +	genlmsg_cancel(skb, ovs_header); +error: +	return -EMSGSIZE; +} + +static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +{ +	return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); +} + +/* Called with rcu_read_lock or ovs_mutex. */ +static struct datapath *lookup_datapath(struct net *net, +					struct ovs_header *ovs_header, +					struct nlattr *a[OVS_DP_ATTR_MAX + 1]) +{ +	struct datapath *dp; + +	if (!a[OVS_DP_ATTR_NAME]) +		dp = get_dp(net, ovs_header->dp_ifindex); +	else { +		struct vport *vport; + +		vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); +		dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; +	} +	return dp ? dp : ERR_PTR(-ENODEV); +} + +static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +{ +	struct datapath *dp; + +	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); +	if (IS_ERR(dp)) +		return; + +	WARN(dp->user_features, "Dropping previously announced user features\n"); +	dp->user_features = 0; +} + +static void ovs_dp_change(struct datapath *dp, struct nlattr **a) +{ +	if (a[OVS_DP_ATTR_USER_FEATURES]) +		dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); +} + +static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct vport_parms parms; +	struct sk_buff *reply; +	struct datapath *dp; +	struct vport *vport; +	struct ovs_net *ovs_net; +	int err, i; + +	err = -EINVAL; +	if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) +		goto err; + +	reply = ovs_dp_cmd_alloc_info(info); +	if (!reply) +		return -ENOMEM; + +	err = -ENOMEM; +	dp = kzalloc(sizeof(*dp), GFP_KERNEL); +	if (dp == NULL) +		goto err_free_reply; + +	ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); + +	/* Allocate table. */ +	err = ovs_flow_tbl_init(&dp->table); +	if (err) +		goto err_free_dp; + +	dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); +	if (!dp->stats_percpu) { +		err = -ENOMEM; +		goto err_destroy_table; +	} + +	dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), +			    GFP_KERNEL); +	if (!dp->ports) { +		err = -ENOMEM; +		goto err_destroy_percpu; +	} + +	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) +		INIT_HLIST_HEAD(&dp->ports[i]); + +	/* Set up our datapath device. 
*/ +	parms.name = nla_data(a[OVS_DP_ATTR_NAME]); +	parms.type = OVS_VPORT_TYPE_INTERNAL; +	parms.options = NULL; +	parms.dp = dp; +	parms.port_no = OVSP_LOCAL; +	parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + +	ovs_dp_change(dp, a); + +	/* So far only local changes have been made, now need the lock. */ +	ovs_lock(); + +	vport = new_vport(&parms); +	if (IS_ERR(vport)) { +		err = PTR_ERR(vport); +		if (err == -EBUSY) +			err = -EEXIST; + +		if (err == -EEXIST) { +			/* An outdated user space instance that does not understand +			 * the concept of user_features has attempted to create a new +			 * datapath and is likely to reuse it. Drop all user features. +			 */ +			if (info->genlhdr->version < OVS_DP_VER_FEATURES) +				ovs_dp_reset_user_features(skb, info); +		} + +		goto err_destroy_ports_array; +	} + +	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, +				   info->snd_seq, 0, OVS_DP_CMD_NEW); +	BUG_ON(err < 0); + +	ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); +	list_add_tail_rcu(&dp->list_node, &ovs_net->dps); + +	ovs_unlock(); + +	ovs_notify(&dp_datapath_genl_family, reply, info); +	return 0; + +err_destroy_ports_array: +	ovs_unlock(); +	kfree(dp->ports); +err_destroy_percpu: +	free_percpu(dp->stats_percpu); +err_destroy_table: +	ovs_flow_tbl_destroy(&dp->table, false); +err_free_dp: +	release_net(ovs_dp_get_net(dp)); +	kfree(dp); +err_free_reply: +	kfree_skb(reply); +err: +	return err; +} + +/* Called with ovs_mutex. */ +static void __dp_destroy(struct datapath *dp) +{ +	int i; + +	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { +		struct vport *vport; +		struct hlist_node *n; + +		hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) +			if (vport->port_no != OVSP_LOCAL) +				ovs_dp_detach_port(vport); +	} + +	list_del_rcu(&dp->list_node); + +	/* OVSP_LOCAL is datapath internal port. We need to make sure that +	 * all ports in datapath are destroyed first before freeing datapath. 
+	 */ +	ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + +	/* RCU destroy the flow table */ +	ovs_flow_tbl_destroy(&dp->table, true); + +	call_rcu(&dp->rcu, destroy_dp_rcu); +} + +static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ +	struct sk_buff *reply; +	struct datapath *dp; +	int err; + +	reply = ovs_dp_cmd_alloc_info(info); +	if (!reply) +		return -ENOMEM; + +	ovs_lock(); +	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); +	err = PTR_ERR(dp); +	if (IS_ERR(dp)) +		goto err_unlock_free; + +	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, +				   info->snd_seq, 0, OVS_DP_CMD_DEL); +	BUG_ON(err < 0); + +	__dp_destroy(dp); +	ovs_unlock(); + +	ovs_notify(&dp_datapath_genl_family, reply, info); + +	return 0; + +err_unlock_free: +	ovs_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ +	struct sk_buff *reply; +	struct datapath *dp; +	int err; + +	reply = ovs_dp_cmd_alloc_info(info); +	if (!reply) +		return -ENOMEM; + +	ovs_lock(); +	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); +	err = PTR_ERR(dp); +	if (IS_ERR(dp)) +		goto err_unlock_free; + +	ovs_dp_change(dp, info->attrs); + +	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, +				   info->snd_seq, 0, OVS_DP_CMD_NEW); +	BUG_ON(err < 0); + +	ovs_unlock(); +	ovs_notify(&dp_datapath_genl_family, reply, info); + +	return 0; + +err_unlock_free: +	ovs_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ +	struct sk_buff *reply; +	struct datapath *dp; +	int err; + +	reply = ovs_dp_cmd_alloc_info(info); +	if (!reply) +		return -ENOMEM; + +	rcu_read_lock(); +	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); +	if (IS_ERR(dp)) { +		err = PTR_ERR(dp); +		goto err_unlock_free; +	} +	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, +				   info->snd_seq, 0, OVS_DP_CMD_NEW); +	BUG_ON(err < 0); +	rcu_read_unlock(); + +	return genlmsg_reply(reply, info); + +err_unlock_free: +	rcu_read_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); +	struct datapath *dp; +	int skip = cb->args[0]; +	int i = 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) { +		if (i >= skip && +		    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, +					 cb->nlh->nlmsg_seq, NLM_F_MULTI, +					 OVS_DP_CMD_NEW) < 0) +			break; +		i++; +	} +	rcu_read_unlock(); + +	cb->args[0] = i; + +	return skb->len; +} + +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { +	[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, +	[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, +	[OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, +}; + +static struct genl_ops dp_datapath_genl_ops[] = { +	{ .cmd = OVS_DP_CMD_NEW, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = datapath_policy, +	  .doit = ovs_dp_cmd_new +	}, +	{ .cmd = OVS_DP_CMD_DEL, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = datapath_policy, +	  .doit = ovs_dp_cmd_del +	}, +	{ .cmd = OVS_DP_CMD_GET, +	  .flags = 0,		    /* OK for unprivileged users. 
*/ +	  .policy = datapath_policy, +	  .doit = ovs_dp_cmd_get, +	  .dumpit = ovs_dp_cmd_dump +	}, +	{ .cmd = OVS_DP_CMD_SET, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = datapath_policy, +	  .doit = ovs_dp_cmd_set, +	}, +}; + +static struct genl_family dp_datapath_genl_family = { +	.id = GENL_ID_GENERATE, +	.hdrsize = sizeof(struct ovs_header), +	.name = OVS_DATAPATH_FAMILY, +	.version = OVS_DATAPATH_VERSION, +	.maxattr = OVS_DP_ATTR_MAX, +	.netnsok = true, +	.parallel_ops = true, +	.ops = dp_datapath_genl_ops, +	.n_ops = ARRAY_SIZE(dp_datapath_genl_ops), +	.mcgrps = &ovs_dp_datapath_multicast_group, +	.n_mcgrps = 1, +}; + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, +				   u32 portid, u32 seq, u32 flags, u8 cmd) +{ +	struct ovs_header *ovs_header; +	struct ovs_vport_stats vport_stats; +	int err; + +	ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, +				 flags, cmd); +	if (!ovs_header) +		return -EMSGSIZE; + +	ovs_header->dp_ifindex = get_dpifindex(vport->dp); + +	if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || +	    nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || +	    nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) || +	    nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid)) +		goto nla_put_failure; + +	ovs_vport_get_stats(vport, &vport_stats); +	if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), +		    &vport_stats)) +		goto nla_put_failure; + +	err = ovs_vport_get_options(vport, skb); +	if (err == -EMSGSIZE) +		goto error; + +	return genlmsg_end(skb, ovs_header); + +nla_put_failure: +	err = -EMSGSIZE; +error: +	genlmsg_cancel(skb, ovs_header); +	return err; +} + +static struct sk_buff *ovs_vport_cmd_alloc_info(void) +{ +	return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +} + +/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, +					 u32 seq, u8 cmd) +{ +	struct sk_buff *skb; +	int retval; + +	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); +	if (!skb) +		return ERR_PTR(-ENOMEM); + +	retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); +	BUG_ON(retval < 0); + +	return skb; +} + +/* Called with ovs_mutex or RCU read lock. 
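+ * A vport may be looked up by name (OVS_VPORT_ATTR_NAME), in which case a
+ * non-zero dp_ifindex is only checked for consistency, or by
+ * OVS_VPORT_ATTR_PORT_NO within the datapath given by dp_ifindex; a request
+ * carrying neither attribute is rejected with -EINVAL.  For example (sketch,
+ * values hypothetical):
+ *   OVS_VPORT_ATTR_NAME = "vport-a"             -> lookup by device name
+ *   dp_ifindex = 3, OVS_VPORT_ATTR_PORT_NO = 2  -> lookup by port number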
*/ +static struct vport *lookup_vport(struct net *net, +				  struct ovs_header *ovs_header, +				  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) +{ +	struct datapath *dp; +	struct vport *vport; + +	if (a[OVS_VPORT_ATTR_NAME]) { +		vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); +		if (!vport) +			return ERR_PTR(-ENODEV); +		if (ovs_header->dp_ifindex && +		    ovs_header->dp_ifindex != get_dpifindex(vport->dp)) +			return ERR_PTR(-ENODEV); +		return vport; +	} else if (a[OVS_VPORT_ATTR_PORT_NO]) { +		u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + +		if (port_no >= DP_MAX_PORTS) +			return ERR_PTR(-EFBIG); + +		dp = get_dp(net, ovs_header->dp_ifindex); +		if (!dp) +			return ERR_PTR(-ENODEV); + +		vport = ovs_vport_ovsl_rcu(dp, port_no); +		if (!vport) +			return ERR_PTR(-ENODEV); +		return vport; +	} else +		return ERR_PTR(-EINVAL); +} + +static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct ovs_header *ovs_header = info->userhdr; +	struct vport_parms parms; +	struct sk_buff *reply; +	struct vport *vport; +	struct datapath *dp; +	u32 port_no; +	int err; + +	if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || +	    !a[OVS_VPORT_ATTR_UPCALL_PID]) +		return -EINVAL; + +	port_no = a[OVS_VPORT_ATTR_PORT_NO] +		? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; +	if (port_no >= DP_MAX_PORTS) +		return -EFBIG; + +	reply = ovs_vport_cmd_alloc_info(); +	if (!reply) +		return -ENOMEM; + +	ovs_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	err = -ENODEV; +	if (!dp) +		goto exit_unlock_free; + +	if (port_no) { +		vport = ovs_vport_ovsl(dp, port_no); +		err = -EBUSY; +		if (vport) +			goto exit_unlock_free; +	} else { +		for (port_no = 1; ; port_no++) { +			if (port_no >= DP_MAX_PORTS) { +				err = -EFBIG; +				goto exit_unlock_free; +			} +			vport = ovs_vport_ovsl(dp, port_no); +			if (!vport) +				break; +		} +	} + +	parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); +	parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); +	parms.options = a[OVS_VPORT_ATTR_OPTIONS]; +	parms.dp = dp; +	parms.port_no = port_no; +	parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + +	vport = new_vport(&parms); +	err = PTR_ERR(vport); +	if (IS_ERR(vport)) +		goto exit_unlock_free; + +	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, +				      info->snd_seq, 0, OVS_VPORT_CMD_NEW); +	BUG_ON(err < 0); +	ovs_unlock(); + +	ovs_notify(&dp_vport_genl_family, reply, info); +	return 0; + +exit_unlock_free: +	ovs_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct sk_buff *reply; +	struct vport *vport; +	int err; + +	reply = ovs_vport_cmd_alloc_info(); +	if (!reply) +		return -ENOMEM; + +	ovs_lock(); +	vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); +	err = PTR_ERR(vport); +	if (IS_ERR(vport)) +		goto exit_unlock_free; + +	if (a[OVS_VPORT_ATTR_TYPE] && +	    nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { +		err = -EINVAL; +		goto exit_unlock_free; +	} + +	if (a[OVS_VPORT_ATTR_OPTIONS]) { +		err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); +		if (err) +			goto exit_unlock_free; +	} + +	if (a[OVS_VPORT_ATTR_UPCALL_PID]) +		vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + +	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, +				      info->snd_seq, 0, OVS_VPORT_CMD_NEW); +	BUG_ON(err < 0); + +	ovs_unlock(); +	
ovs_notify(&dp_vport_genl_family, reply, info); +	return 0; + +exit_unlock_free: +	ovs_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct sk_buff *reply; +	struct vport *vport; +	int err; + +	reply = ovs_vport_cmd_alloc_info(); +	if (!reply) +		return -ENOMEM; + +	ovs_lock(); +	vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); +	err = PTR_ERR(vport); +	if (IS_ERR(vport)) +		goto exit_unlock_free; + +	if (vport->port_no == OVSP_LOCAL) { +		err = -EINVAL; +		goto exit_unlock_free; +	} + +	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, +				      info->snd_seq, 0, OVS_VPORT_CMD_DEL); +	BUG_ON(err < 0); +	ovs_dp_detach_port(vport); +	ovs_unlock(); + +	ovs_notify(&dp_vport_genl_family, reply, info); +	return 0; + +exit_unlock_free: +	ovs_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ +	struct nlattr **a = info->attrs; +	struct ovs_header *ovs_header = info->userhdr; +	struct sk_buff *reply; +	struct vport *vport; +	int err; + +	reply = ovs_vport_cmd_alloc_info(); +	if (!reply) +		return -ENOMEM; + +	rcu_read_lock(); +	vport = lookup_vport(sock_net(skb->sk), ovs_header, a); +	err = PTR_ERR(vport); +	if (IS_ERR(vport)) +		goto exit_unlock_free; +	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, +				      info->snd_seq, 0, OVS_VPORT_CMD_NEW); +	BUG_ON(err < 0); +	rcu_read_unlock(); + +	return genlmsg_reply(reply, info); + +exit_unlock_free: +	rcu_read_unlock(); +	kfree_skb(reply); +	return err; +} + +static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); +	struct datapath *dp; +	int bucket = cb->args[0], skip = cb->args[1]; +	int i, j = 0; + +	rcu_read_lock(); +	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); +	if (!dp) { +		rcu_read_unlock(); +		return -ENODEV; +	} +	for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { +		struct vport *vport; + +		j = 0; +		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { +			if (j >= skip && +			    ovs_vport_cmd_fill_info(vport, skb, +						    NETLINK_CB(cb->skb).portid, +						    cb->nlh->nlmsg_seq, +						    NLM_F_MULTI, +						    OVS_VPORT_CMD_NEW) < 0) +				goto out; + +			j++; +		} +		skip = 0; +	} +out: +	rcu_read_unlock(); + +	cb->args[0] = i; +	cb->args[1] = j; + +	return skb->len; +} + +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { +	[OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, +	[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, +	[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, +	[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, +	[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, +	[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + +static struct genl_ops dp_vport_genl_ops[] = { +	{ .cmd = OVS_VPORT_CMD_NEW, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = vport_policy, +	  .doit = ovs_vport_cmd_new +	}, +	{ .cmd = OVS_VPORT_CMD_DEL, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = vport_policy, +	  .doit = ovs_vport_cmd_del +	}, +	{ .cmd = OVS_VPORT_CMD_GET, +	  .flags = 0,		    /* OK for unprivileged users. 
*/ +	  .policy = vport_policy, +	  .doit = ovs_vport_cmd_get, +	  .dumpit = ovs_vport_cmd_dump +	}, +	{ .cmd = OVS_VPORT_CMD_SET, +	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ +	  .policy = vport_policy, +	  .doit = ovs_vport_cmd_set, +	}, +}; + +struct genl_family dp_vport_genl_family = { +	.id = GENL_ID_GENERATE, +	.hdrsize = sizeof(struct ovs_header), +	.name = OVS_VPORT_FAMILY, +	.version = OVS_VPORT_VERSION, +	.maxattr = OVS_VPORT_ATTR_MAX, +	.netnsok = true, +	.parallel_ops = true, +	.ops = dp_vport_genl_ops, +	.n_ops = ARRAY_SIZE(dp_vport_genl_ops), +	.mcgrps = &ovs_dp_vport_multicast_group, +	.n_mcgrps = 1, +}; + +static struct genl_family * const dp_genl_families[] = { +	&dp_datapath_genl_family, +	&dp_vport_genl_family, +	&dp_flow_genl_family, +	&dp_packet_genl_family, +}; + +static void dp_unregister_genl(int n_families) +{ +	int i; + +	for (i = 0; i < n_families; i++) +		genl_unregister_family(dp_genl_families[i]); +} + +static int dp_register_genl(void) +{ +	int err; +	int i; + +	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { + +		err = genl_register_family(dp_genl_families[i]); +		if (err) +			goto error; +	} + +	return 0; + +error: +	dp_unregister_genl(i); +	return err; +} + +static int __net_init ovs_init_net(struct net *net) +{ +	struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + +	INIT_LIST_HEAD(&ovs_net->dps); +	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); +	return 0; +} + +static void __net_exit ovs_exit_net(struct net *net) +{ +	struct datapath *dp, *dp_next; +	struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + +	ovs_lock(); +	list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) +		__dp_destroy(dp); +	ovs_unlock(); + +	cancel_work_sync(&ovs_net->dp_notify_work); +} + +static struct pernet_operations ovs_net_ops = { +	.init = ovs_init_net, +	.exit = ovs_exit_net, +	.id   = &ovs_net_id, +	.size = sizeof(struct ovs_net), +}; + +static int __init dp_init(void) +{ +	int err; + +	BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + +	pr_info("Open vSwitch switching datapath\n"); + +	err = ovs_flow_init(); +	if (err) +		goto error; + +	err = ovs_vport_init(); +	if (err) +		goto error_flow_exit; + +	err = register_pernet_device(&ovs_net_ops); +	if (err) +		goto error_vport_exit; + +	err = register_netdevice_notifier(&ovs_dp_device_notifier); +	if (err) +		goto error_netns_exit; + +	err = dp_register_genl(); +	if (err < 0) +		goto error_unreg_notifier; + +	return 0; + +error_unreg_notifier: +	unregister_netdevice_notifier(&ovs_dp_device_notifier); +error_netns_exit: +	unregister_pernet_device(&ovs_net_ops); +error_vport_exit: +	ovs_vport_exit(); +error_flow_exit: +	ovs_flow_exit(); +error: +	return err; +} + +static void dp_cleanup(void) +{ +	dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); +	unregister_netdevice_notifier(&ovs_dp_device_notifier); +	unregister_pernet_device(&ovs_net_ops); +	rcu_barrier(); +	ovs_vport_exit(); +	ovs_flow_exit(); +} + +module_init(dp_init); +module_exit(dp_cleanup); + +MODULE_DESCRIPTION("Open vSwitch switching datapath"); +MODULE_LICENSE("GPL"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h new file mode 100644 index 00000000000..7ede507500d --- /dev/null +++ b/net/openvswitch/datapath.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef DATAPATH_H +#define DATAPATH_H 1 + +#include <asm/page.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/u64_stats_sync.h> + +#include "flow.h" +#include "flow_table.h" +#include "vport.h" + +#define DP_MAX_PORTS           USHRT_MAX +#define DP_VPORT_HASH_BUCKETS  1024 + +#define SAMPLE_ACTION_DEPTH 3 + +/** + * struct dp_stats_percpu - per-cpu packet processing statistics for a given + * datapath. + * @n_hit: Number of received packets for which a matching flow was found in + * the flow table. + * @n_miss: Number of received packets that had no matching flow in the flow + * table.  The sum of @n_hit and @n_miss is the number of packets that have + * been received by the datapath. + * @n_lost: Number of received packets that had no matching flow in the flow + * table that could not be sent to userspace (normally due to an overflow in + * one of the datapath's queues). + * @n_mask_hit: Number of masks looked up for flow match. + *   @n_mask_hit / (@n_hit + @n_missed)  will be the average masks looked + *   up per packet. + */ +struct dp_stats_percpu { +	u64 n_hit; +	u64 n_missed; +	u64 n_lost; +	u64 n_mask_hit; +	struct u64_stats_sync syncp; +}; + +/** + * struct datapath - datapath for flow-based packet switching + * @rcu: RCU callback head for deferred destruction. + * @list_node: Element in global 'dps' list. + * @table: flow table. + * @ports: Hash table for ports.  %OVSP_LOCAL port always exists.  Protected by + * ovs_mutex and RCU. + * @stats_percpu: Per-CPU datapath statistics. + * @net: Reference to net namespace. + * + * Context: See the comment on locking at the top of datapath.c for additional + * locking information. + */ +struct datapath { +	struct rcu_head rcu; +	struct list_head list_node; + +	/* Flow table. */ +	struct flow_table table; + +	/* Switch ports. */ +	struct hlist_head *ports; + +	/* Stats. */ +	struct dp_stats_percpu __percpu *stats_percpu; + +#ifdef CONFIG_NET_NS +	/* Network namespace ref. */ +	struct net *net; +#endif + +	u32 user_features; +}; + +/** + * struct ovs_skb_cb - OVS data in skb CB + * @flow: The flow associated with this packet.  May be %NULL if no flow. + * @pkt_key: The flow information extracted from the packet.  Must be nonnull. + * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the + * packet is not being tunneled. + */ +struct ovs_skb_cb { +	struct sw_flow		*flow; +	struct sw_flow_key	*pkt_key; +	struct ovs_key_ipv4_tunnel  *tun_key; +}; +#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) + +/** + * struct dp_upcall - metadata to include with a packet to send to userspace + * @cmd: One of %OVS_PACKET_CMD_*. + * @key: Becomes %OVS_PACKET_ATTR_KEY.  Must be nonnull. 
+ * @userdata: If nonnull, its variable-length value is passed to userspace as + * %OVS_PACKET_ATTR_USERDATA. + * @pid: Netlink PID to which packet should be sent.  If @pid is 0 then no + * packet is sent and the packet is accounted in the datapath's @n_lost + * counter. + */ +struct dp_upcall_info { +	u8 cmd; +	const struct sw_flow_key *key; +	const struct nlattr *userdata; +	u32 portid; +}; + +/** + * struct ovs_net - Per net-namespace data for ovs. + * @dps: List of datapaths to enable dumping them all out. + * Protected by genl_mutex. + */ +struct ovs_net { +	struct list_head dps; +	struct work_struct dp_notify_work; +	struct vport_net vport_net; +}; + +extern int ovs_net_id; +void ovs_lock(void); +void ovs_unlock(void); + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void); +#else +#define lockdep_ovsl_is_held()	1 +#endif + +#define ASSERT_OVSL()		WARN_ON(unlikely(!lockdep_ovsl_is_held())) +#define ovsl_dereference(p)					\ +	rcu_dereference_protected(p, lockdep_ovsl_is_held()) +#define rcu_dereference_ovsl(p)					\ +	rcu_dereference_check(p, lockdep_ovsl_is_held()) + +static inline struct net *ovs_dp_get_net(struct datapath *dp) +{ +	return read_pnet(&dp->net); +} + +static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) +{ +	write_pnet(&dp->net, net); +} + +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); + +static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) +{ +	WARN_ON_ONCE(!rcu_read_lock_held()); +	return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) +{ +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); +	return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) +{ +	ASSERT_OVSL(); +	return ovs_lookup_vport(dp, port_no); +} + +extern struct notifier_block ovs_dp_device_notifier; +extern struct genl_family dp_vport_genl_family; + +void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_detach_port(struct vport *); +int ovs_dp_upcall(struct datapath *, struct sk_buff *, +		  const struct dp_upcall_info *); + +struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, +					 u8 cmd); + +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); +void ovs_dp_notify_wq(struct work_struct *work); + +#define OVS_NLERR(fmt, ...)					\ +do {								\ +	if (net_ratelimit())					\ +		pr_info("netlink: " fmt, ##__VA_ARGS__);	\ +} while (0) +#endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c new file mode 100644 index 00000000000..2c631fe76be --- /dev/null +++ b/net/openvswitch/dp_notify.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/netdevice.h> +#include <net/genetlink.h> +#include <net/netns/generic.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static void dp_detach_port_notify(struct vport *vport) +{ +	struct sk_buff *notify; +	struct datapath *dp; + +	dp = vport->dp; +	notify = ovs_vport_cmd_build_info(vport, 0, 0, +					  OVS_VPORT_CMD_DEL); +	ovs_dp_detach_port(vport); +	if (IS_ERR(notify)) { +		genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, +			     0, PTR_ERR(notify)); +		return; +	} + +	genlmsg_multicast_netns(&dp_vport_genl_family, +				ovs_dp_get_net(dp), notify, 0, +				0, GFP_KERNEL); +} + +void ovs_dp_notify_wq(struct work_struct *work) +{ +	struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); +	struct datapath *dp; + +	ovs_lock(); +	list_for_each_entry(dp, &ovs_net->dps, list_node) { +		int i; + +		for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { +			struct vport *vport; +			struct hlist_node *n; + +			hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { +				struct netdev_vport *netdev_vport; + +				if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) +					continue; + +				netdev_vport = netdev_vport_priv(vport); +				if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) +					dp_detach_port_notify(vport); +			} +		} +	} +	ovs_unlock(); +} + +static int dp_device_event(struct notifier_block *unused, unsigned long event, +			   void *ptr) +{ +	struct ovs_net *ovs_net; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct vport *vport = NULL; + +	if (!ovs_is_internal_dev(dev)) +		vport = ovs_netdev_get_vport(dev); + +	if (!vport) +		return NOTIFY_DONE; + +	if (event == NETDEV_UNREGISTER) { +		/* upper_dev_unlink and decrement promisc immediately */ +		ovs_netdev_detach_dev(vport); + +		/* schedule vport destroy, dev_put and genl notification */ +		ovs_net = net_generic(dev_net(dev), ovs_net_id); +		queue_work(system_wq, &ovs_net->dp_notify_work); +	} + +	return NOTIFY_DONE; +} + +struct notifier_block ovs_dp_device_notifier = { +	.notifier_call = dp_device_event +}; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c new file mode 100644 index 00000000000..d07ab538fc9 --- /dev/null +++ b/net/openvswitch/flow.c @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/smp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +u64 ovs_flow_used_time(unsigned long flow_jiffies) +{ +	struct timespec cur_ts; +	u64 cur_ms, idle_ms; + +	ktime_get_ts(&cur_ts); +	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); +	cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + +		 cur_ts.tv_nsec / NSEC_PER_MSEC; + +	return cur_ms - idle_ms; +} + +#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) + +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, +			   struct sk_buff *skb) +{ +	struct flow_stats *stats; +	int node = numa_node_id(); + +	stats = rcu_dereference(flow->stats[node]); + +	/* Check if already have node-specific stats. */ +	if (likely(stats)) { +		spin_lock(&stats->lock); +		/* Mark if we write on the pre-allocated stats. */ +		if (node == 0 && unlikely(flow->stats_last_writer != node)) +			flow->stats_last_writer = node; +	} else { +		stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ +		spin_lock(&stats->lock); + +		/* If the current NUMA-node is the only writer on the +		 * pre-allocated stats keep using them. +		 */ +		if (unlikely(flow->stats_last_writer != node)) { +			/* A previous locker may have already allocated the +			 * stats, so we need to check again.  If node-specific +			 * stats were already allocated, we update the pre- +			 * allocated stats as we have already locked them. +			 */ +			if (likely(flow->stats_last_writer != NUMA_NO_NODE) +			    && likely(!rcu_dereference(flow->stats[node]))) { +				/* Try to allocate node-specific stats. */ +				struct flow_stats *new_stats; + +				new_stats = +					kmem_cache_alloc_node(flow_stats_cache, +							      GFP_THISNODE | +							      __GFP_NOMEMALLOC, +							      node); +				if (likely(new_stats)) { +					new_stats->used = jiffies; +					new_stats->packet_count = 1; +					new_stats->byte_count = skb->len; +					new_stats->tcp_flags = tcp_flags; +					spin_lock_init(&new_stats->lock); + +					rcu_assign_pointer(flow->stats[node], +							   new_stats); +					goto unlock; +				} +			} +			flow->stats_last_writer = node; +		} +	} + +	stats->used = jiffies; +	stats->packet_count++; +	stats->byte_count += skb->len; +	stats->tcp_flags |= tcp_flags; +unlock: +	spin_unlock(&stats->lock); +} + +/* Must be called with rcu_read_lock or ovs_mutex. 
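+ * Totals are accumulated over every per-NUMA-node flow_stats instance:
+ * packet and byte counts are summed, TCP flags are ORed together, and
+ * 'used' ends up as the most recent timestamp seen on any node.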
*/ +void ovs_flow_stats_get(const struct sw_flow *flow, +			struct ovs_flow_stats *ovs_stats, +			unsigned long *used, __be16 *tcp_flags) +{ +	int node; + +	*used = 0; +	*tcp_flags = 0; +	memset(ovs_stats, 0, sizeof(*ovs_stats)); + +	for_each_node(node) { +		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + +		if (stats) { +			/* Local CPU may write on non-local stats, so we must +			 * block bottom-halves here. +			 */ +			spin_lock_bh(&stats->lock); +			if (!*used || time_after(stats->used, *used)) +				*used = stats->used; +			*tcp_flags |= stats->tcp_flags; +			ovs_stats->n_packets += stats->packet_count; +			ovs_stats->n_bytes += stats->byte_count; +			spin_unlock_bh(&stats->lock); +		} +	} +} + +/* Called with ovs_mutex. */ +void ovs_flow_stats_clear(struct sw_flow *flow) +{ +	int node; + +	for_each_node(node) { +		struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + +		if (stats) { +			spin_lock_bh(&stats->lock); +			stats->used = 0; +			stats->packet_count = 0; +			stats->byte_count = 0; +			stats->tcp_flags = 0; +			spin_unlock_bh(&stats->lock); +		} +	} +} + +static int check_header(struct sk_buff *skb, int len) +{ +	if (unlikely(skb->len < len)) +		return -EINVAL; +	if (unlikely(!pskb_may_pull(skb, len))) +		return -ENOMEM; +	return 0; +} + +static bool arphdr_ok(struct sk_buff *skb) +{ +	return pskb_may_pull(skb, skb_network_offset(skb) + +				  sizeof(struct arp_eth_header)); +} + +static int check_iphdr(struct sk_buff *skb) +{ +	unsigned int nh_ofs = skb_network_offset(skb); +	unsigned int ip_len; +	int err; + +	err = check_header(skb, nh_ofs + sizeof(struct iphdr)); +	if (unlikely(err)) +		return err; + +	ip_len = ip_hdrlen(skb); +	if (unlikely(ip_len < sizeof(struct iphdr) || +		     skb->len < nh_ofs + ip_len)) +		return -EINVAL; + +	skb_set_transport_header(skb, nh_ofs + ip_len); +	return 0; +} + +static bool tcphdr_ok(struct sk_buff *skb) +{ +	int th_ofs = skb_transport_offset(skb); +	int tcp_len; + +	if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr)))) +		return false; + +	tcp_len = tcp_hdrlen(skb); +	if (unlikely(tcp_len < sizeof(struct tcphdr) || +		     skb->len < th_ofs + tcp_len)) +		return false; + +	return true; +} + +static bool udphdr_ok(struct sk_buff *skb) +{ +	return pskb_may_pull(skb, skb_transport_offset(skb) + +				  sizeof(struct udphdr)); +} + +static bool sctphdr_ok(struct sk_buff *skb) +{ +	return pskb_may_pull(skb, skb_transport_offset(skb) + +				  sizeof(struct sctphdr)); +} + +static bool icmphdr_ok(struct sk_buff *skb) +{ +	return pskb_may_pull(skb, skb_transport_offset(skb) + +				  sizeof(struct icmphdr)); +} + +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) +{ +	unsigned int nh_ofs = skb_network_offset(skb); +	unsigned int nh_len; +	int payload_ofs; +	struct ipv6hdr *nh; +	uint8_t nexthdr; +	__be16 frag_off; +	int err; + +	err = check_header(skb, nh_ofs + sizeof(*nh)); +	if (unlikely(err)) +		return err; + +	nh = ipv6_hdr(skb); +	nexthdr = nh->nexthdr; +	payload_ofs = (u8 *)(nh + 1) - skb->data; + +	key->ip.proto = NEXTHDR_NONE; +	key->ip.tos = ipv6_get_dsfield(nh); +	key->ip.ttl = nh->hop_limit; +	key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); +	key->ipv6.addr.src = nh->saddr; +	key->ipv6.addr.dst = nh->daddr; + +	payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); +	if (unlikely(payload_ofs < 0)) +		return -EINVAL; + +	if (frag_off) { +		if (frag_off & htons(~0x7)) +			key->ip.frag = OVS_FRAG_TYPE_LATER; +		else +			key->ip.frag = 
OVS_FRAG_TYPE_FIRST; +	} + +	nh_len = payload_ofs - nh_ofs; +	skb_set_transport_header(skb, nh_ofs + nh_len); +	key->ip.proto = nexthdr; +	return nh_len; +} + +static bool icmp6hdr_ok(struct sk_buff *skb) +{ +	return pskb_may_pull(skb, skb_transport_offset(skb) + +				  sizeof(struct icmp6hdr)); +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ +	struct qtag_prefix { +		__be16 eth_type; /* ETH_P_8021Q */ +		__be16 tci; +	}; +	struct qtag_prefix *qp; + +	if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) +		return 0; + +	if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + +					 sizeof(__be16)))) +		return -ENOMEM; + +	qp = (struct qtag_prefix *) skb->data; +	key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); +	__skb_pull(skb, sizeof(struct qtag_prefix)); + +	return 0; +} + +static __be16 parse_ethertype(struct sk_buff *skb) +{ +	struct llc_snap_hdr { +		u8  dsap;  /* Always 0xAA */ +		u8  ssap;  /* Always 0xAA */ +		u8  ctrl; +		u8  oui[3]; +		__be16 ethertype; +	}; +	struct llc_snap_hdr *llc; +	__be16 proto; + +	proto = *(__be16 *) skb->data; +	__skb_pull(skb, sizeof(__be16)); + +	if (ntohs(proto) >= ETH_P_802_3_MIN) +		return proto; + +	if (skb->len < sizeof(struct llc_snap_hdr)) +		return htons(ETH_P_802_2); + +	if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr)))) +		return htons(0); + +	llc = (struct llc_snap_hdr *) skb->data; +	if (llc->dsap != LLC_SAP_SNAP || +	    llc->ssap != LLC_SAP_SNAP || +	    (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) +		return htons(ETH_P_802_2); + +	__skb_pull(skb, sizeof(struct llc_snap_hdr)); + +	if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) +		return llc->ethertype; + +	return htons(ETH_P_802_2); +} + +static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, +			int nh_len) +{ +	struct icmp6hdr *icmp = icmp6_hdr(skb); + +	/* The ICMPv6 type and code fields use the 16-bit transport port +	 * fields, so we need to store them in 16-bit network byte order. +	 */ +	key->tp.src = htons(icmp->icmp6_type); +	key->tp.dst = htons(icmp->icmp6_code); + +	if (icmp->icmp6_code == 0 && +	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || +	     icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) { +		int icmp_len = skb->len - skb_transport_offset(skb); +		struct nd_msg *nd; +		int offset; + +		/* In order to process neighbor discovery options, we need the +		 * entire packet. +		 */ +		if (unlikely(icmp_len < sizeof(*nd))) +			return 0; + +		if (unlikely(skb_linearize(skb))) +			return -ENOMEM; + +		nd = (struct nd_msg *)skb_transport_header(skb); +		key->ipv6.nd.target = nd->target; + +		icmp_len -= sizeof(*nd); +		offset = 0; +		while (icmp_len >= 8) { +			struct nd_opt_hdr *nd_opt = +				 (struct nd_opt_hdr *)(nd->opt + offset); +			int opt_len = nd_opt->nd_opt_len * 8; + +			if (unlikely(!opt_len || opt_len > icmp_len)) +				return 0; + +			/* Store the link layer address if the appropriate +			 * option is provided.  It is considered an error if +			 * the same link layer option is specified twice. 
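+			 * Rather than failing the whole flow extraction, the
+			 * 'invalid' path below simply clears the ND portion of
+			 * the key and returns 0.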
+			 */ +			if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR +			    && opt_len == 8) { +				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) +					goto invalid; +				ether_addr_copy(key->ipv6.nd.sll, +						&nd->opt[offset+sizeof(*nd_opt)]); +			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR +				   && opt_len == 8) { +				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) +					goto invalid; +				ether_addr_copy(key->ipv6.nd.tll, +						&nd->opt[offset+sizeof(*nd_opt)]); +			} + +			icmp_len -= opt_len; +			offset += opt_len; +		} +	} + +	return 0; + +invalid: +	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); +	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); +	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); + +	return 0; +} + +/** + * ovs_flow_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @in_port: port number on which @skb was received. + * @key: output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Returns 0 if successful, otherwise a negative errno value. + * + * Initializes @skb header pointers as follows: + * + *    - skb->mac_header: the Ethernet header. + * + *    - skb->network_header: just past the Ethernet header, or just past the + *      VLAN header, to the first byte of the Ethernet payload. + * + *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 + *      on output, then just past the IP header, if one is present and + *      of a correct length, otherwise the same as skb->network_header. + *      For other key->eth.type values it is left untouched. + */ +int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) +{ +	int error; +	struct ethhdr *eth; + +	memset(key, 0, sizeof(*key)); + +	key->phy.priority = skb->priority; +	if (OVS_CB(skb)->tun_key) +		memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); +	key->phy.in_port = in_port; +	key->phy.skb_mark = skb->mark; + +	skb_reset_mac_header(skb); + +	/* Link layer.  We are guaranteed to have at least the 14 byte Ethernet +	 * header in the linear data area. +	 */ +	eth = eth_hdr(skb); +	ether_addr_copy(key->eth.src, eth->h_source); +	ether_addr_copy(key->eth.dst, eth->h_dest); + +	__skb_pull(skb, 2 * ETH_ALEN); +	/* We are going to push all headers that we pull, so no need to +	 * update skb->csum here. +	 */ + +	if (vlan_tx_tag_present(skb)) +		key->eth.tci = htons(skb->vlan_tci); +	else if (eth->h_proto == htons(ETH_P_8021Q)) +		if (unlikely(parse_vlan(skb, key))) +			return -ENOMEM; + +	key->eth.type = parse_ethertype(skb); +	if (unlikely(key->eth.type == htons(0))) +		return -ENOMEM; + +	skb_reset_network_header(skb); +	__skb_push(skb, skb->data - skb_mac_header(skb)); + +	/* Network layer. 
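+	 * A truncated or otherwise malformed IPv4 header is not fatal here:
+	 * when check_iphdr() returns -EINVAL the transport header falls back
+	 * to the network header and extraction still succeeds, just without
+	 * the IPv4 and transport fields in the key.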
*/ +	if (key->eth.type == htons(ETH_P_IP)) { +		struct iphdr *nh; +		__be16 offset; + +		error = check_iphdr(skb); +		if (unlikely(error)) { +			if (error == -EINVAL) { +				skb->transport_header = skb->network_header; +				error = 0; +			} +			return error; +		} + +		nh = ip_hdr(skb); +		key->ipv4.addr.src = nh->saddr; +		key->ipv4.addr.dst = nh->daddr; + +		key->ip.proto = nh->protocol; +		key->ip.tos = nh->tos; +		key->ip.ttl = nh->ttl; + +		offset = nh->frag_off & htons(IP_OFFSET); +		if (offset) { +			key->ip.frag = OVS_FRAG_TYPE_LATER; +			return 0; +		} +		if (nh->frag_off & htons(IP_MF) || +			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP) +			key->ip.frag = OVS_FRAG_TYPE_FIRST; + +		/* Transport layer. */ +		if (key->ip.proto == IPPROTO_TCP) { +			if (tcphdr_ok(skb)) { +				struct tcphdr *tcp = tcp_hdr(skb); +				key->tp.src = tcp->source; +				key->tp.dst = tcp->dest; +				key->tp.flags = TCP_FLAGS_BE16(tcp); +			} +		} else if (key->ip.proto == IPPROTO_UDP) { +			if (udphdr_ok(skb)) { +				struct udphdr *udp = udp_hdr(skb); +				key->tp.src = udp->source; +				key->tp.dst = udp->dest; +			} +		} else if (key->ip.proto == IPPROTO_SCTP) { +			if (sctphdr_ok(skb)) { +				struct sctphdr *sctp = sctp_hdr(skb); +				key->tp.src = sctp->source; +				key->tp.dst = sctp->dest; +			} +		} else if (key->ip.proto == IPPROTO_ICMP) { +			if (icmphdr_ok(skb)) { +				struct icmphdr *icmp = icmp_hdr(skb); +				/* The ICMP type and code fields use the 16-bit +				 * transport port fields, so we need to store +				 * them in 16-bit network byte order. */ +				key->tp.src = htons(icmp->type); +				key->tp.dst = htons(icmp->code); +			} +		} + +	} else if ((key->eth.type == htons(ETH_P_ARP) || +		   key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) { +		struct arp_eth_header *arp; + +		arp = (struct arp_eth_header *)skb_network_header(skb); + +		if (arp->ar_hrd == htons(ARPHRD_ETHER) +				&& arp->ar_pro == htons(ETH_P_IP) +				&& arp->ar_hln == ETH_ALEN +				&& arp->ar_pln == 4) { + +			/* We only match on the lower 8 bits of the opcode. */ +			if (ntohs(arp->ar_op) <= 0xff) +				key->ip.proto = ntohs(arp->ar_op); +			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); +			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); +			ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); +			ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); +		} +	} else if (key->eth.type == htons(ETH_P_IPV6)) { +		int nh_len;             /* IPv6 Header + Extensions */ + +		nh_len = parse_ipv6hdr(skb, key); +		if (unlikely(nh_len < 0)) { +			if (nh_len == -EINVAL) { +				skb->transport_header = skb->network_header; +				error = 0; +			} else { +				error = nh_len; +			} +			return error; +		} + +		if (key->ip.frag == OVS_FRAG_TYPE_LATER) +			return 0; +		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) +			key->ip.frag = OVS_FRAG_TYPE_FIRST; + +		/* Transport layer. 
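+		 * Only reached for unfragmented packets or first fragments;
+		 * later fragments were handled above (OVS_FRAG_TYPE_LATER)
+		 * and never get port numbers extracted.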
*/ +		if (key->ip.proto == NEXTHDR_TCP) { +			if (tcphdr_ok(skb)) { +				struct tcphdr *tcp = tcp_hdr(skb); +				key->tp.src = tcp->source; +				key->tp.dst = tcp->dest; +				key->tp.flags = TCP_FLAGS_BE16(tcp); +			} +		} else if (key->ip.proto == NEXTHDR_UDP) { +			if (udphdr_ok(skb)) { +				struct udphdr *udp = udp_hdr(skb); +				key->tp.src = udp->source; +				key->tp.dst = udp->dest; +			} +		} else if (key->ip.proto == NEXTHDR_SCTP) { +			if (sctphdr_ok(skb)) { +				struct sctphdr *sctp = sctp_hdr(skb); +				key->tp.src = sctp->source; +				key->tp.dst = sctp->dest; +			} +		} else if (key->ip.proto == NEXTHDR_ICMP) { +			if (icmp6hdr_ok(skb)) { +				error = parse_icmpv6(skb, key, nh_len); +				if (error) +					return error; +			} +		} +	} + +	return 0; +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h new file mode 100644 index 00000000000..5e5aaed3a85 --- /dev/null +++ b/net/openvswitch/flow.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_H +#define FLOW_H 1 + +#include <linux/cache.h> +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> +#include <net/inet_ecn.h> + +struct sk_buff; + +/* Used to memset ovs_key_ipv4_tunnel padding. */ +#define OVS_TUNNEL_KEY_SIZE					\ +	(offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) +	\ +	FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl)) + +struct ovs_key_ipv4_tunnel { +	__be64 tun_id; +	__be32 ipv4_src; +	__be32 ipv4_dst; +	__be16 tun_flags; +	u8   ipv4_tos; +	u8   ipv4_ttl; +} __packed __aligned(4); /* Minimize padding. */ + +static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, +					 const struct iphdr *iph, __be64 tun_id, +					 __be16 tun_flags) +{ +	tun_key->tun_id = tun_id; +	tun_key->ipv4_src = iph->saddr; +	tun_key->ipv4_dst = iph->daddr; +	tun_key->ipv4_tos = iph->tos; +	tun_key->ipv4_ttl = iph->ttl; +	tun_key->tun_flags = tun_flags; + +	/* clear struct padding. */ +	memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, +	       sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); +} + +struct sw_flow_key { +	struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */ +	struct { +		u32	priority;	/* Packet QoS priority. */ +		u32	skb_mark;	/* SKB mark. */ +		u16	in_port;	/* Input switch port (or DP_MAX_PORTS). */ +	} __packed phy; /* Safe when right after 'tun_key'. */ +	struct { +		u8     src[ETH_ALEN];	/* Ethernet source address. */ +		u8     dst[ETH_ALEN];	/* Ethernet destination address. */ +		__be16 tci;		/* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ +		__be16 type;		/* Ethernet frame type. 
*/ +	} eth; +	struct { +		u8     proto;		/* IP protocol or lower 8 bits of ARP opcode. */ +		u8     tos;		/* IP ToS. */ +		u8     ttl;		/* IP TTL/hop limit. */ +		u8     frag;		/* One of OVS_FRAG_TYPE_*. */ +	} ip; +	struct { +		__be16 src;		/* TCP/UDP/SCTP source port. */ +		__be16 dst;		/* TCP/UDP/SCTP destination port. */ +		__be16 flags;		/* TCP flags. */ +	} tp; +	union { +		struct { +			struct { +				__be32 src;	/* IP source address. */ +				__be32 dst;	/* IP destination address. */ +			} addr; +			struct { +				u8 sha[ETH_ALEN];	/* ARP source hardware address. */ +				u8 tha[ETH_ALEN];	/* ARP target hardware address. */ +			} arp; +		} ipv4; +		struct { +			struct { +				struct in6_addr src;	/* IPv6 source address. */ +				struct in6_addr dst;	/* IPv6 destination address. */ +			} addr; +			__be32 label;			/* IPv6 flow label. */ +			struct { +				struct in6_addr target;	/* ND target address. */ +				u8 sll[ETH_ALEN];	/* ND source link layer address. */ +				u8 tll[ETH_ALEN];	/* ND target link layer address. */ +			} nd; +		} ipv6; +	}; +} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ + +struct sw_flow_key_range { +	unsigned short int start; +	unsigned short int end; +}; + +struct sw_flow_mask { +	int ref_count; +	struct rcu_head rcu; +	struct list_head list; +	struct sw_flow_key_range range; +	struct sw_flow_key key; +}; + +struct sw_flow_match { +	struct sw_flow_key *key; +	struct sw_flow_key_range range; +	struct sw_flow_mask *mask; +}; + +struct sw_flow_actions { +	struct rcu_head rcu; +	u32 actions_len; +	struct nlattr actions[]; +}; + +struct flow_stats { +	u64 packet_count;		/* Number of packets matched. */ +	u64 byte_count;			/* Number of bytes matched. */ +	unsigned long used;		/* Last used time (in jiffies). */ +	spinlock_t lock;		/* Lock for atomic stats update. */ +	__be16 tcp_flags;		/* Union of seen TCP flags. */ +}; + +struct sw_flow { +	struct rcu_head rcu; +	struct hlist_node hash_node[2]; +	u32 hash; +	int stats_last_writer;		/* NUMA-node id of the last writer on +					 * 'stats[0]'. +					 */ +	struct sw_flow_key key; +	struct sw_flow_key unmasked_key; +	struct sw_flow_mask *mask; +	struct sw_flow_actions __rcu *sf_acts; +	struct flow_stats __rcu *stats[]; /* One for each NUMA node.  First one +					   * is allocated at flow creation time, +					   * the rest are allocated on demand +					   * while holding the 'stats[0].lock'. +					   */ +}; + +struct arp_eth_header { +	__be16      ar_hrd;	/* format of hardware address   */ +	__be16      ar_pro;	/* format of protocol address   */ +	unsigned char   ar_hln;	/* length of hardware address   */ +	unsigned char   ar_pln;	/* length of protocol address   */ +	__be16      ar_op;	/* ARP opcode (command)     */ + +	/* Ethernet+IPv4 specific members. 
*/ +	unsigned char       ar_sha[ETH_ALEN];	/* sender hardware address  */ +	unsigned char       ar_sip[4];		/* sender IP address        */ +	unsigned char       ar_tha[ETH_ALEN];	/* target hardware address  */ +	unsigned char       ar_tip[4];		/* target IP address        */ +} __packed; + +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, +			   struct sk_buff *); +void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, +			unsigned long *used, __be16 *tcp_flags); +void ovs_flow_stats_clear(struct sw_flow *); +u64 ovs_flow_used_time(unsigned long flow_jiffies); + +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); + +#endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c new file mode 100644 index 00000000000..d757848da89 --- /dev/null +++ b/net/openvswitch/flow_netlink.c @@ -0,0 +1,1576 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +#include "flow_netlink.h" + +static void update_range__(struct sw_flow_match *match, +			   size_t offset, size_t size, bool is_mask) +{ +	struct sw_flow_key_range *range = NULL; +	size_t start = rounddown(offset, sizeof(long)); +	size_t end = roundup(offset + size, sizeof(long)); + +	if (!is_mask) +		range = &match->range; +	else if (match->mask) +		range = &match->mask->range; + +	if (!range) +		return; + +	if (range->start == range->end) { +		range->start = start; +		range->end = end; +		return; +	} + +	if (range->start > start) +		range->start = start; + +	if (range->end < end) +		range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ +	do { \ +		update_range__(match, offsetof(struct sw_flow_key, field),  \ +				     sizeof((match)->key->field), is_mask); \ +		if (is_mask) {						    \ +			if ((match)->mask)				    \ +				(match)->mask->key.field = value;	    \ +		} else {                                                    \ +			(match)->key->field = value;		            \ +		}                                                           \ +	} while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ +	do { \ +		
update_range__(match, offsetof(struct sw_flow_key, field),  \ +				len, is_mask);                              \ +		if (is_mask) {						    \ +			if ((match)->mask)				    \ +				memcpy(&(match)->mask->key.field, value_p, len);\ +		} else {                                                    \ +			memcpy(&(match)->key->field, value_p, len);         \ +		}                                                           \ +	} while (0) + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ +	return range->end - range->start; +} + +static bool match_validate(const struct sw_flow_match *match, +			   u64 key_attrs, u64 mask_attrs) +{ +	u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; +	u64 mask_allowed = key_attrs;  /* At most allow all key attributes */ + +	/* The following mask attributes allowed only if they +	 * pass the validation tests. */ +	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) +			| (1 << OVS_KEY_ATTR_IPV6) +			| (1 << OVS_KEY_ATTR_TCP) +			| (1 << OVS_KEY_ATTR_TCP_FLAGS) +			| (1 << OVS_KEY_ATTR_UDP) +			| (1 << OVS_KEY_ATTR_SCTP) +			| (1 << OVS_KEY_ATTR_ICMP) +			| (1 << OVS_KEY_ATTR_ICMPV6) +			| (1 << OVS_KEY_ATTR_ARP) +			| (1 << OVS_KEY_ATTR_ND)); + +	/* Always allowed mask fields. */ +	mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) +		       | (1 << OVS_KEY_ATTR_IN_PORT) +		       | (1 << OVS_KEY_ATTR_ETHERTYPE)); + +	/* Check key attributes. */ +	if (match->key->eth.type == htons(ETH_P_ARP) +			|| match->key->eth.type == htons(ETH_P_RARP)) { +		key_expected |= 1 << OVS_KEY_ATTR_ARP; +		if (match->mask && (match->mask->key.eth.type == htons(0xffff))) +			mask_allowed |= 1 << OVS_KEY_ATTR_ARP; +	} + +	if (match->key->eth.type == htons(ETH_P_IP)) { +		key_expected |= 1 << OVS_KEY_ATTR_IPV4; +		if (match->mask && (match->mask->key.eth.type == htons(0xffff))) +			mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + +		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { +			if (match->key->ip.proto == IPPROTO_UDP) { +				key_expected |= 1 << OVS_KEY_ATTR_UDP; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) +					mask_allowed |= 1 << OVS_KEY_ATTR_UDP; +			} + +			if (match->key->ip.proto == IPPROTO_SCTP) { +				key_expected |= 1 << OVS_KEY_ATTR_SCTP; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) +					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; +			} + +			if (match->key->ip.proto == IPPROTO_TCP) { +				key_expected |= 1 << OVS_KEY_ATTR_TCP; +				key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) { +					mask_allowed |= 1 << OVS_KEY_ATTR_TCP; +					mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; +				} +			} + +			if (match->key->ip.proto == IPPROTO_ICMP) { +				key_expected |= 1 << OVS_KEY_ATTR_ICMP; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) +					mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; +			} +		} +	} + +	if (match->key->eth.type == htons(ETH_P_IPV6)) { +		key_expected |= 1 << OVS_KEY_ATTR_IPV6; +		if (match->mask && (match->mask->key.eth.type == htons(0xffff))) +			mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + +		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { +			if (match->key->ip.proto == IPPROTO_UDP) { +				key_expected |= 1 << OVS_KEY_ATTR_UDP; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) +					mask_allowed |= 1 << OVS_KEY_ATTR_UDP; +			} + +			if (match->key->ip.proto == IPPROTO_SCTP) { +				key_expected |= 1 << OVS_KEY_ATTR_SCTP; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) +					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; +			} + +			if (match->key->ip.proto 
== IPPROTO_TCP) { +				key_expected |= 1 << OVS_KEY_ATTR_TCP; +				key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) { +					mask_allowed |= 1 << OVS_KEY_ATTR_TCP; +					mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; +				} +			} + +			if (match->key->ip.proto == IPPROTO_ICMPV6) { +				key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; +				if (match->mask && (match->mask->key.ip.proto == 0xff)) +					mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + +				if (match->key->tp.src == +						htons(NDISC_NEIGHBOUR_SOLICITATION) || +				    match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { +					key_expected |= 1 << OVS_KEY_ATTR_ND; +					if (match->mask && (match->mask->key.tp.src == htons(0xffff))) +						mask_allowed |= 1 << OVS_KEY_ATTR_ND; +				} +			} +		} +	} + +	if ((key_attrs & key_expected) != key_expected) { +		/* Key attributes check failed. */ +		OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", +				(unsigned long long)key_attrs, (unsigned long long)key_expected); +		return false; +	} + +	if ((mask_attrs & mask_allowed) != mask_attrs) { +		/* Mask attributes check failed. */ +		OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", +				(unsigned long long)mask_attrs, (unsigned long long)mask_allowed); +		return false; +	} + +	return true; +} + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */ +static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { +	[OVS_KEY_ATTR_ENCAP] = -1, +	[OVS_KEY_ATTR_PRIORITY] = sizeof(u32), +	[OVS_KEY_ATTR_IN_PORT] = sizeof(u32), +	[OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), +	[OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), +	[OVS_KEY_ATTR_VLAN] = sizeof(__be16), +	[OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), +	[OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), +	[OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), +	[OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), +	[OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16), +	[OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), +	[OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), +	[OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), +	[OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), +	[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), +	[OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), +	[OVS_KEY_ATTR_TUNNEL] = -1, +}; + +static bool is_all_zero(const u8 *fp, size_t size) +{ +	int i; + +	if (!fp) +		return false; + +	for (i = 0; i < size; i++) +		if (fp[i]) +			return false; + +	return true; +} + +static int __parse_flow_nlattrs(const struct nlattr *attr, +				const struct nlattr *a[], +				u64 *attrsp, bool nz) +{ +	const struct nlattr *nla; +	u64 attrs; +	int rem; + +	attrs = *attrsp; +	nla_for_each_nested(nla, attr, rem) { +		u16 type = nla_type(nla); +		int expected_len; + +		if (type > OVS_KEY_ATTR_MAX) { +			OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", +				  type, OVS_KEY_ATTR_MAX); +			return -EINVAL; +		} + +		if (attrs & (1 << type)) { +			OVS_NLERR("Duplicate key attribute (type %d).\n", type); +			return -EINVAL; +		} + +		expected_len = ovs_key_lens[type]; +		if (nla_len(nla) != expected_len && expected_len != -1) { +			OVS_NLERR("Key attribute has unexpected length (type=%d" +				  ", length=%d, expected=%d).\n", type, +				  nla_len(nla), expected_len); +			return -EINVAL; +		} + +		if (!nz || !is_all_zero(nla_data(nla), expected_len)) { +			attrs |= 1 << type; +			a[type] = nla; +		} +	} +	if (rem) { +		OVS_NLERR("Message has %d unknown bytes.\n", rem); 
+		return -EINVAL; +	} + +	*attrsp = attrs; +	return 0; +} + +static int parse_flow_mask_nlattrs(const struct nlattr *attr, +				   const struct nlattr *a[], u64 *attrsp) +{ +	return __parse_flow_nlattrs(attr, a, attrsp, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, +			      const struct nlattr *a[], u64 *attrsp) +{ +	return __parse_flow_nlattrs(attr, a, attrsp, false); +} + +static int ipv4_tun_from_nlattr(const struct nlattr *attr, +				struct sw_flow_match *match, bool is_mask) +{ +	struct nlattr *a; +	int rem; +	bool ttl = false; +	__be16 tun_flags = 0; + +	nla_for_each_nested(a, attr, rem) { +		int type = nla_type(a); +		static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { +			[OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), +			[OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), +			[OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), +			[OVS_TUNNEL_KEY_ATTR_TOS] = 1, +			[OVS_TUNNEL_KEY_ATTR_TTL] = 1, +			[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, +			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0, +		}; + +		if (type > OVS_TUNNEL_KEY_ATTR_MAX) { +			OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", +			type, OVS_TUNNEL_KEY_ATTR_MAX); +			return -EINVAL; +		} + +		if (ovs_tunnel_key_lens[type] != nla_len(a)) { +			OVS_NLERR("IPv4 tunnel attribute type has unexpected " +				  " length (type=%d, length=%d, expected=%d).\n", +				  type, nla_len(a), ovs_tunnel_key_lens[type]); +			return -EINVAL; +		} + +		switch (type) { +		case OVS_TUNNEL_KEY_ATTR_ID: +			SW_FLOW_KEY_PUT(match, tun_key.tun_id, +					nla_get_be64(a), is_mask); +			tun_flags |= TUNNEL_KEY; +			break; +		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: +			SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, +					nla_get_be32(a), is_mask); +			break; +		case OVS_TUNNEL_KEY_ATTR_IPV4_DST: +			SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, +					nla_get_be32(a), is_mask); +			break; +		case OVS_TUNNEL_KEY_ATTR_TOS: +			SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, +					nla_get_u8(a), is_mask); +			break; +		case OVS_TUNNEL_KEY_ATTR_TTL: +			SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, +					nla_get_u8(a), is_mask); +			ttl = true; +			break; +		case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: +			tun_flags |= TUNNEL_DONT_FRAGMENT; +			break; +		case OVS_TUNNEL_KEY_ATTR_CSUM: +			tun_flags |= TUNNEL_CSUM; +			break; +		default: +			return -EINVAL; +		} +	} + +	SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + +	if (rem > 0) { +		OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); +		return -EINVAL; +	} + +	if (!is_mask) { +		if (!match->key->tun_key.ipv4_dst) { +			OVS_NLERR("IPv4 tunnel destination address is zero.\n"); +			return -EINVAL; +		} + +		if (!ttl) { +			OVS_NLERR("IPv4 tunnel TTL not specified.\n"); +			return -EINVAL; +		} +	} + +	return 0; +} + +static int ipv4_tun_to_nlattr(struct sk_buff *skb, +			      const struct ovs_key_ipv4_tunnel *tun_key, +			      const struct ovs_key_ipv4_tunnel *output) +{ +	struct nlattr *nla; + +	nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); +	if (!nla) +		return -EMSGSIZE; + +	if (output->tun_flags & TUNNEL_KEY && +	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) +		return -EMSGSIZE; +	if (output->ipv4_src && +		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) +		return -EMSGSIZE; +	if (output->ipv4_dst && +		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) +		return -EMSGSIZE; +	if (output->ipv4_tos && +		nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) +		return -EMSGSIZE; +	if (nla_put_u8(skb, 
OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) +		return -EMSGSIZE; +	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && +		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) +		return -EMSGSIZE; +	if ((output->tun_flags & TUNNEL_CSUM) && +		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) +		return -EMSGSIZE; + +	nla_nest_end(skb, nla); +	return 0; +} + + +static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs, +				 const struct nlattr **a, bool is_mask) +{ +	if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { +		SW_FLOW_KEY_PUT(match, phy.priority, +			  nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); +		*attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); +	} + +	if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { +		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + +		if (is_mask) +			in_port = 0xffffffff; /* Always exact match in_port. */ +		else if (in_port >= DP_MAX_PORTS) +			return -EINVAL; + +		SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); +		*attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); +	} else if (!is_mask) { +		SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); +	} + +	if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { +		uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + +		SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); +		*attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); +	} +	if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { +		if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, +					 is_mask)) +			return -EINVAL; +		*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); +	} +	return 0; +} + +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, +				const struct nlattr **a, bool is_mask) +{ +	int err; +	u64 orig_attrs = attrs; + +	err = metadata_from_nlattrs(match, &attrs, a, is_mask); +	if (err) +		return err; + +	if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { +		const struct ovs_key_ethernet *eth_key; + +		eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); +		SW_FLOW_KEY_MEMCPY(match, eth.src, +				eth_key->eth_src, ETH_ALEN, is_mask); +		SW_FLOW_KEY_MEMCPY(match, eth.dst, +				eth_key->eth_dst, ETH_ALEN, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { +		__be16 tci; + +		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); +		if (!(tci & htons(VLAN_TAG_PRESENT))) { +			if (is_mask) +				OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); +			else +				OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); + +			return -EINVAL; +		} + +		SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_VLAN); +	} else if (!is_mask) +		SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + +	if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { +		__be16 eth_type; + +		eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); +		if (is_mask) { +			/* Always exact match EtherType. 
*/ +			eth_type = htons(0xffff); +		} else if (ntohs(eth_type) < ETH_P_802_3_MIN) { +			OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", +					ntohs(eth_type), ETH_P_802_3_MIN); +			return -EINVAL; +		} + +		SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); +	} else if (!is_mask) { +		SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { +		const struct ovs_key_ipv4 *ipv4_key; + +		ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); +		if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { +			OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", +				ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); +			return -EINVAL; +		} +		SW_FLOW_KEY_PUT(match, ip.proto, +				ipv4_key->ipv4_proto, is_mask); +		SW_FLOW_KEY_PUT(match, ip.tos, +				ipv4_key->ipv4_tos, is_mask); +		SW_FLOW_KEY_PUT(match, ip.ttl, +				ipv4_key->ipv4_ttl, is_mask); +		SW_FLOW_KEY_PUT(match, ip.frag, +				ipv4_key->ipv4_frag, is_mask); +		SW_FLOW_KEY_PUT(match, ipv4.addr.src, +				ipv4_key->ipv4_src, is_mask); +		SW_FLOW_KEY_PUT(match, ipv4.addr.dst, +				ipv4_key->ipv4_dst, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_IPV4); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { +		const struct ovs_key_ipv6 *ipv6_key; + +		ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); +		if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { +			OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", +				ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); +			return -EINVAL; +		} +		SW_FLOW_KEY_PUT(match, ipv6.label, +				ipv6_key->ipv6_label, is_mask); +		SW_FLOW_KEY_PUT(match, ip.proto, +				ipv6_key->ipv6_proto, is_mask); +		SW_FLOW_KEY_PUT(match, ip.tos, +				ipv6_key->ipv6_tclass, is_mask); +		SW_FLOW_KEY_PUT(match, ip.ttl, +				ipv6_key->ipv6_hlimit, is_mask); +		SW_FLOW_KEY_PUT(match, ip.frag, +				ipv6_key->ipv6_frag, is_mask); +		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, +				ipv6_key->ipv6_src, +				sizeof(match->key->ipv6.addr.src), +				is_mask); +		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, +				ipv6_key->ipv6_dst, +				sizeof(match->key->ipv6.addr.dst), +				is_mask); + +		attrs &= ~(1 << OVS_KEY_ATTR_IPV6); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_ARP)) { +		const struct ovs_key_arp *arp_key; + +		arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); +		if (!is_mask && (arp_key->arp_op & htons(0xff00))) { +			OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", +				  arp_key->arp_op); +			return -EINVAL; +		} + +		SW_FLOW_KEY_PUT(match, ipv4.addr.src, +				arp_key->arp_sip, is_mask); +		SW_FLOW_KEY_PUT(match, ipv4.addr.dst, +			arp_key->arp_tip, is_mask); +		SW_FLOW_KEY_PUT(match, ip.proto, +				ntohs(arp_key->arp_op), is_mask); +		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, +				arp_key->arp_sha, ETH_ALEN, is_mask); +		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, +				arp_key->arp_tha, ETH_ALEN, is_mask); + +		attrs &= ~(1 << OVS_KEY_ATTR_ARP); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_TCP)) { +		const struct ovs_key_tcp *tcp_key; + +		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); +		SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); +		SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_TCP); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { +		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { +			SW_FLOW_KEY_PUT(match, tp.flags, +					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), +					is_mask); +		} else { +			SW_FLOW_KEY_PUT(match, tp.flags, +					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), +					is_mask); +		} +		attrs &= 
~(1 << OVS_KEY_ATTR_TCP_FLAGS); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_UDP)) { +		const struct ovs_key_udp *udp_key; + +		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); +		SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); +		SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_UDP); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { +		const struct ovs_key_sctp *sctp_key; + +		sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); +		SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); +		SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_SCTP); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { +		const struct ovs_key_icmp *icmp_key; + +		icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); +		SW_FLOW_KEY_PUT(match, tp.src, +				htons(icmp_key->icmp_type), is_mask); +		SW_FLOW_KEY_PUT(match, tp.dst, +				htons(icmp_key->icmp_code), is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_ICMP); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { +		const struct ovs_key_icmpv6 *icmpv6_key; + +		icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); +		SW_FLOW_KEY_PUT(match, tp.src, +				htons(icmpv6_key->icmpv6_type), is_mask); +		SW_FLOW_KEY_PUT(match, tp.dst, +				htons(icmpv6_key->icmpv6_code), is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); +	} + +	if (attrs & (1 << OVS_KEY_ATTR_ND)) { +		const struct ovs_key_nd *nd_key; + +		nd_key = nla_data(a[OVS_KEY_ATTR_ND]); +		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, +			nd_key->nd_target, +			sizeof(match->key->ipv6.nd.target), +			is_mask); +		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, +			nd_key->nd_sll, ETH_ALEN, is_mask); +		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, +				nd_key->nd_tll, ETH_ALEN, is_mask); +		attrs &= ~(1 << OVS_KEY_ATTR_ND); +	} + +	if (attrs != 0) +		return -EINVAL; + +	return 0; +} + +static void sw_flow_mask_set(struct sw_flow_mask *mask, +			     struct sw_flow_key_range *range, u8 val) +{ +	u8 *m = (u8 *)&mask->key + range->start; + +	mask->range = *range; +	memset(m, val, range_n_bytes(range)); +} + +/** + * ovs_nla_get_match - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. 
+ */ +int ovs_nla_get_match(struct sw_flow_match *match, +		      const struct nlattr *key, +		      const struct nlattr *mask) +{ +	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; +	const struct nlattr *encap; +	u64 key_attrs = 0; +	u64 mask_attrs = 0; +	bool encap_valid = false; +	int err; + +	err = parse_flow_nlattrs(key, a, &key_attrs); +	if (err) +		return err; + +	if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && +	    (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && +	    (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { +		__be16 tci; + +		if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && +		      (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { +			OVS_NLERR("Invalid Vlan frame.\n"); +			return -EINVAL; +		} + +		key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); +		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); +		encap = a[OVS_KEY_ATTR_ENCAP]; +		key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); +		encap_valid = true; + +		if (tci & htons(VLAN_TAG_PRESENT)) { +			err = parse_flow_nlattrs(encap, a, &key_attrs); +			if (err) +				return err; +		} else if (!tci) { +			/* Corner case for truncated 802.1Q header. */ +			if (nla_len(encap)) { +				OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); +				return -EINVAL; +			} +		} else { +			OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); +			return  -EINVAL; +		} +	} + +	err = ovs_key_from_nlattrs(match, key_attrs, a, false); +	if (err) +		return err; + +	if (mask) { +		err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); +		if (err) +			return err; + +		if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP)  { +			__be16 eth_type = 0; +			__be16 tci = 0; + +			if (!encap_valid) { +				OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); +				return  -EINVAL; +			} + +			mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); +			if (a[OVS_KEY_ATTR_ETHERTYPE]) +				eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + +			if (eth_type == htons(0xffff)) { +				mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); +				encap = a[OVS_KEY_ATTR_ENCAP]; +				err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); +			} else { +				OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", +						ntohs(eth_type)); +				return -EINVAL; +			} + +			if (a[OVS_KEY_ATTR_VLAN]) +				tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + +			if (!(tci & htons(VLAN_TAG_PRESENT))) { +				OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); +				return -EINVAL; +			} +		} + +		err = ovs_key_from_nlattrs(match, mask_attrs, a, true); +		if (err) +			return err; +	} else { +		/* Populate exact match flow's key mask. */ +		if (match->mask) +			sw_flow_mask_set(match->mask, &match->range, 0xff); +	} + +	if (!match_validate(match, key_attrs, mask_attrs)) +		return -EINVAL; + +	return 0; +} + +/** + * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. + * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. 
+ */ + +int ovs_nla_get_flow_metadata(struct sw_flow *flow, +			      const struct nlattr *attr) +{ +	struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; +	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; +	u64 attrs = 0; +	int err; +	struct sw_flow_match match; + +	flow->key.phy.in_port = DP_MAX_PORTS; +	flow->key.phy.priority = 0; +	flow->key.phy.skb_mark = 0; +	memset(tun_key, 0, sizeof(flow->key.tun_key)); + +	err = parse_flow_nlattrs(attr, a, &attrs); +	if (err) +		return -EINVAL; + +	memset(&match, 0, sizeof(match)); +	match.key = &flow->key; + +	err = metadata_from_nlattrs(&match, &attrs, a, false); +	if (err) +		return err; + +	return 0; +} + +int ovs_nla_put_flow(const struct sw_flow_key *swkey, +		     const struct sw_flow_key *output, struct sk_buff *skb) +{ +	struct ovs_key_ethernet *eth_key; +	struct nlattr *nla, *encap; +	bool is_mask = (swkey != output); + +	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) +		goto nla_put_failure; + +	if ((swkey->tun_key.ipv4_dst || is_mask) && +	    ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) +		goto nla_put_failure; + +	if (swkey->phy.in_port == DP_MAX_PORTS) { +		if (is_mask && (output->phy.in_port == 0xffff)) +			if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) +				goto nla_put_failure; +	} else { +		u16 upper_u16; +		upper_u16 = !is_mask ? 0 : 0xffff; + +		if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, +				(upper_u16 << 16) | output->phy.in_port)) +			goto nla_put_failure; +	} + +	if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) +		goto nla_put_failure; + +	nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); +	if (!nla) +		goto nla_put_failure; + +	eth_key = nla_data(nla); +	ether_addr_copy(eth_key->eth_src, output->eth.src); +	ether_addr_copy(eth_key->eth_dst, output->eth.dst); + +	if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { +		__be16 eth_type; +		eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); +		if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || +		    nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) +			goto nla_put_failure; +		encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); +		if (!swkey->eth.tci) +			goto unencap; +	} else +		encap = NULL; + +	if (swkey->eth.type == htons(ETH_P_802_2)) { +		/* +		 * Ethertype 802.2 is represented in the netlink with omitted +		 * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and +		 * 0xffff in the mask attribute.  Ethertype can also +		 * be wildcarded. 
+		 */ +		if (is_mask && output->eth.type) +			if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, +						output->eth.type)) +				goto nla_put_failure; +		goto unencap; +	} + +	if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) +		goto nla_put_failure; + +	if (swkey->eth.type == htons(ETH_P_IP)) { +		struct ovs_key_ipv4 *ipv4_key; + +		nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); +		if (!nla) +			goto nla_put_failure; +		ipv4_key = nla_data(nla); +		ipv4_key->ipv4_src = output->ipv4.addr.src; +		ipv4_key->ipv4_dst = output->ipv4.addr.dst; +		ipv4_key->ipv4_proto = output->ip.proto; +		ipv4_key->ipv4_tos = output->ip.tos; +		ipv4_key->ipv4_ttl = output->ip.ttl; +		ipv4_key->ipv4_frag = output->ip.frag; +	} else if (swkey->eth.type == htons(ETH_P_IPV6)) { +		struct ovs_key_ipv6 *ipv6_key; + +		nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); +		if (!nla) +			goto nla_put_failure; +		ipv6_key = nla_data(nla); +		memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, +				sizeof(ipv6_key->ipv6_src)); +		memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, +				sizeof(ipv6_key->ipv6_dst)); +		ipv6_key->ipv6_label = output->ipv6.label; +		ipv6_key->ipv6_proto = output->ip.proto; +		ipv6_key->ipv6_tclass = output->ip.tos; +		ipv6_key->ipv6_hlimit = output->ip.ttl; +		ipv6_key->ipv6_frag = output->ip.frag; +	} else if (swkey->eth.type == htons(ETH_P_ARP) || +		   swkey->eth.type == htons(ETH_P_RARP)) { +		struct ovs_key_arp *arp_key; + +		nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); +		if (!nla) +			goto nla_put_failure; +		arp_key = nla_data(nla); +		memset(arp_key, 0, sizeof(struct ovs_key_arp)); +		arp_key->arp_sip = output->ipv4.addr.src; +		arp_key->arp_tip = output->ipv4.addr.dst; +		arp_key->arp_op = htons(output->ip.proto); +		ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); +		ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); +	} + +	if ((swkey->eth.type == htons(ETH_P_IP) || +	     swkey->eth.type == htons(ETH_P_IPV6)) && +	     swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + +		if (swkey->ip.proto == IPPROTO_TCP) { +			struct ovs_key_tcp *tcp_key; + +			nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); +			if (!nla) +				goto nla_put_failure; +			tcp_key = nla_data(nla); +			tcp_key->tcp_src = output->tp.src; +			tcp_key->tcp_dst = output->tp.dst; +			if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, +					 output->tp.flags)) +				goto nla_put_failure; +		} else if (swkey->ip.proto == IPPROTO_UDP) { +			struct ovs_key_udp *udp_key; + +			nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); +			if (!nla) +				goto nla_put_failure; +			udp_key = nla_data(nla); +			udp_key->udp_src = output->tp.src; +			udp_key->udp_dst = output->tp.dst; +		} else if (swkey->ip.proto == IPPROTO_SCTP) { +			struct ovs_key_sctp *sctp_key; + +			nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); +			if (!nla) +				goto nla_put_failure; +			sctp_key = nla_data(nla); +			sctp_key->sctp_src = output->tp.src; +			sctp_key->sctp_dst = output->tp.dst; +		} else if (swkey->eth.type == htons(ETH_P_IP) && +			   swkey->ip.proto == IPPROTO_ICMP) { +			struct ovs_key_icmp *icmp_key; + +			nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); +			if (!nla) +				goto nla_put_failure; +			icmp_key = nla_data(nla); +			icmp_key->icmp_type = ntohs(output->tp.src); +			icmp_key->icmp_code = ntohs(output->tp.dst); +		} else if (swkey->eth.type == htons(ETH_P_IPV6) && +			   swkey->ip.proto == IPPROTO_ICMPV6) { +			struct ovs_key_icmpv6 
*icmpv6_key; + +			nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, +						sizeof(*icmpv6_key)); +			if (!nla) +				goto nla_put_failure; +			icmpv6_key = nla_data(nla); +			icmpv6_key->icmpv6_type = ntohs(output->tp.src); +			icmpv6_key->icmpv6_code = ntohs(output->tp.dst); + +			if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || +			    icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { +				struct ovs_key_nd *nd_key; + +				nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); +				if (!nla) +					goto nla_put_failure; +				nd_key = nla_data(nla); +				memcpy(nd_key->nd_target, &output->ipv6.nd.target, +							sizeof(nd_key->nd_target)); +				ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); +				ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); +			} +		} +	} + +unencap: +	if (encap) +		nla_nest_end(skb, encap); + +	return 0; + +nla_put_failure: +	return -EMSGSIZE; +} + +#define MAX_ACTIONS_BUFSIZE	(32 * 1024) + +struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) +{ +	struct sw_flow_actions *sfa; + +	if (size > MAX_ACTIONS_BUFSIZE) +		return ERR_PTR(-EINVAL); + +	sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); +	if (!sfa) +		return ERR_PTR(-ENOMEM); + +	sfa->actions_len = 0; +	return sfa; +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ +	kfree_rcu(sf_acts, rcu); +} + +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, +				       int attr_len) +{ + +	struct sw_flow_actions *acts; +	int new_acts_size; +	int req_size = NLA_ALIGN(attr_len); +	int next_offset = offsetof(struct sw_flow_actions, actions) + +					(*sfa)->actions_len; + +	if (req_size <= (ksize(*sfa) - next_offset)) +		goto out; + +	new_acts_size = ksize(*sfa) * 2; + +	if (new_acts_size > MAX_ACTIONS_BUFSIZE) { +		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) +			return ERR_PTR(-EMSGSIZE); +		new_acts_size = MAX_ACTIONS_BUFSIZE; +	} + +	acts = ovs_nla_alloc_flow_actions(new_acts_size); +	if (IS_ERR(acts)) +		return (void *)acts; + +	memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); +	acts->actions_len = (*sfa)->actions_len; +	kfree(*sfa); +	*sfa = acts; + +out: +	(*sfa)->actions_len += req_size; +	return  (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +{ +	struct nlattr *a; + +	a = reserve_sfa_size(sfa, nla_attr_size(len)); +	if (IS_ERR(a)) +		return PTR_ERR(a); + +	a->nla_type = attrtype; +	a->nla_len = nla_attr_size(len); + +	if (data) +		memcpy(nla_data(a), data, len); +	memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + +	return 0; +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, +					  int attrtype) +{ +	int used = (*sfa)->actions_len; +	int err; + +	err = add_action(sfa, attrtype, NULL, 0); +	if (err) +		return err; + +	return used; +} + +static inline void add_nested_action_end(struct sw_flow_actions *sfa, +					 int st_offset) +{ +	struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + +							       st_offset); + +	a->nla_len = sfa->actions_len - st_offset; +} + +static int validate_and_copy_sample(const struct nlattr *attr, +				    const struct sw_flow_key *key, int depth, +				    struct sw_flow_actions **sfa) +{ +	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; +	const struct nlattr *probability, *actions; +	const 
struct nlattr *a; +	int rem, start, err, st_acts; + +	memset(attrs, 0, sizeof(attrs)); +	nla_for_each_nested(a, attr, rem) { +		int type = nla_type(a); +		if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) +			return -EINVAL; +		attrs[type] = a; +	} +	if (rem) +		return -EINVAL; + +	probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; +	if (!probability || nla_len(probability) != sizeof(u32)) +		return -EINVAL; + +	actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; +	if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) +		return -EINVAL; + +	/* validation done, copy sample action. */ +	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); +	if (start < 0) +		return start; +	err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, +			 nla_data(probability), sizeof(u32)); +	if (err) +		return err; +	st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); +	if (st_acts < 0) +		return st_acts; + +	err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); +	if (err) +		return err; + +	add_nested_action_end(*sfa, st_acts); +	add_nested_action_end(*sfa, start); + +	return 0; +} + +static int validate_tp_port(const struct sw_flow_key *flow_key) +{ +	if ((flow_key->eth.type == htons(ETH_P_IP) || +	     flow_key->eth.type == htons(ETH_P_IPV6)) && +	    (flow_key->tp.src || flow_key->tp.dst)) +		return 0; + +	return -EINVAL; +} + +void ovs_match_init(struct sw_flow_match *match, +		    struct sw_flow_key *key, +		    struct sw_flow_mask *mask) +{ +	memset(match, 0, sizeof(*match)); +	match->key = key; +	match->mask = mask; + +	memset(key, 0, sizeof(*key)); + +	if (mask) { +		memset(&mask->key, 0, sizeof(mask->key)); +		mask->range.start = mask->range.end = 0; +	} +} + +static int validate_and_copy_set_tun(const struct nlattr *attr, +				     struct sw_flow_actions **sfa) +{ +	struct sw_flow_match match; +	struct sw_flow_key key; +	int err, start; + +	ovs_match_init(&match, &key, NULL); +	err = ipv4_tun_from_nlattr(nla_data(attr), &match, false); +	if (err) +		return err; + +	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); +	if (start < 0) +		return start; + +	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, +			sizeof(match.key->tun_key)); +	add_nested_action_end(*sfa, start); + +	return err; +} + +static int validate_set(const struct nlattr *a, +			const struct sw_flow_key *flow_key, +			struct sw_flow_actions **sfa, +			bool *set_tun) +{ +	const struct nlattr *ovs_key = nla_data(a); +	int key_type = nla_type(ovs_key); + +	/* There can be only one key in a action */ +	if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) +		return -EINVAL; + +	if (key_type > OVS_KEY_ATTR_MAX || +	    (ovs_key_lens[key_type] != nla_len(ovs_key) && +	     ovs_key_lens[key_type] != -1)) +		return -EINVAL; + +	switch (key_type) { +	const struct ovs_key_ipv4 *ipv4_key; +	const struct ovs_key_ipv6 *ipv6_key; +	int err; + +	case OVS_KEY_ATTR_PRIORITY: +	case OVS_KEY_ATTR_SKB_MARK: +	case OVS_KEY_ATTR_ETHERNET: +		break; + +	case OVS_KEY_ATTR_TUNNEL: +		*set_tun = true; +		err = validate_and_copy_set_tun(a, sfa); +		if (err) +			return err; +		break; + +	case OVS_KEY_ATTR_IPV4: +		if (flow_key->eth.type != htons(ETH_P_IP)) +			return -EINVAL; + +		if (!flow_key->ip.proto) +			return -EINVAL; + +		ipv4_key = nla_data(ovs_key); +		if (ipv4_key->ipv4_proto != flow_key->ip.proto) +			return -EINVAL; + +		if (ipv4_key->ipv4_frag != flow_key->ip.frag) +			return -EINVAL; + +		break; + +	case OVS_KEY_ATTR_IPV6: +		if (flow_key->eth.type != htons(ETH_P_IPV6)) +			return -EINVAL; + +	
	if (!flow_key->ip.proto) +			return -EINVAL; + +		ipv6_key = nla_data(ovs_key); +		if (ipv6_key->ipv6_proto != flow_key->ip.proto) +			return -EINVAL; + +		if (ipv6_key->ipv6_frag != flow_key->ip.frag) +			return -EINVAL; + +		if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) +			return -EINVAL; + +		break; + +	case OVS_KEY_ATTR_TCP: +		if (flow_key->ip.proto != IPPROTO_TCP) +			return -EINVAL; + +		return validate_tp_port(flow_key); + +	case OVS_KEY_ATTR_UDP: +		if (flow_key->ip.proto != IPPROTO_UDP) +			return -EINVAL; + +		return validate_tp_port(flow_key); + +	case OVS_KEY_ATTR_SCTP: +		if (flow_key->ip.proto != IPPROTO_SCTP) +			return -EINVAL; + +		return validate_tp_port(flow_key); + +	default: +		return -EINVAL; +	} + +	return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ +	static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { +		[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, +		[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, +	}; +	struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; +	int error; + +	error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, +				 attr, userspace_policy); +	if (error) +		return error; + +	if (!a[OVS_USERSPACE_ATTR_PID] || +	    !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) +		return -EINVAL; + +	return 0; +} + +static int copy_action(const struct nlattr *from, +		       struct sw_flow_actions **sfa) +{ +	int totlen = NLA_ALIGN(from->nla_len); +	struct nlattr *to; + +	to = reserve_sfa_size(sfa, from->nla_len); +	if (IS_ERR(to)) +		return PTR_ERR(to); + +	memcpy(to, from, totlen); +	return 0; +} + +int ovs_nla_copy_actions(const struct nlattr *attr, +			 const struct sw_flow_key *key, +			 int depth, +			 struct sw_flow_actions **sfa) +{ +	const struct nlattr *a; +	int rem, err; + +	if (depth >= SAMPLE_ACTION_DEPTH) +		return -EOVERFLOW; + +	nla_for_each_nested(a, attr, rem) { +		/* Expected argument lengths, (u32)-1 for variable length. 
*/ +		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { +			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), +			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1, +			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), +			[OVS_ACTION_ATTR_POP_VLAN] = 0, +			[OVS_ACTION_ATTR_SET] = (u32)-1, +			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1 +		}; +		const struct ovs_action_push_vlan *vlan; +		int type = nla_type(a); +		bool skip_copy; + +		if (type > OVS_ACTION_ATTR_MAX || +		    (action_lens[type] != nla_len(a) && +		     action_lens[type] != (u32)-1)) +			return -EINVAL; + +		skip_copy = false; +		switch (type) { +		case OVS_ACTION_ATTR_UNSPEC: +			return -EINVAL; + +		case OVS_ACTION_ATTR_USERSPACE: +			err = validate_userspace(a); +			if (err) +				return err; +			break; + +		case OVS_ACTION_ATTR_OUTPUT: +			if (nla_get_u32(a) >= DP_MAX_PORTS) +				return -EINVAL; +			break; + + +		case OVS_ACTION_ATTR_POP_VLAN: +			break; + +		case OVS_ACTION_ATTR_PUSH_VLAN: +			vlan = nla_data(a); +			if (vlan->vlan_tpid != htons(ETH_P_8021Q)) +				return -EINVAL; +			if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) +				return -EINVAL; +			break; + +		case OVS_ACTION_ATTR_SET: +			err = validate_set(a, key, sfa, &skip_copy); +			if (err) +				return err; +			break; + +		case OVS_ACTION_ATTR_SAMPLE: +			err = validate_and_copy_sample(a, key, depth, sfa); +			if (err) +				return err; +			skip_copy = true; +			break; + +		default: +			return -EINVAL; +		} +		if (!skip_copy) { +			err = copy_action(a, sfa); +			if (err) +				return err; +		} +	} + +	if (rem > 0) +		return -EINVAL; + +	return 0; +} + +static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +{ +	const struct nlattr *a; +	struct nlattr *start; +	int err = 0, rem; + +	start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); +	if (!start) +		return -EMSGSIZE; + +	nla_for_each_nested(a, attr, rem) { +		int type = nla_type(a); +		struct nlattr *st_sample; + +		switch (type) { +		case OVS_SAMPLE_ATTR_PROBABILITY: +			if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, +				    sizeof(u32), nla_data(a))) +				return -EMSGSIZE; +			break; +		case OVS_SAMPLE_ATTR_ACTIONS: +			st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); +			if (!st_sample) +				return -EMSGSIZE; +			err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); +			if (err) +				return err; +			nla_nest_end(skb, st_sample); +			break; +		} +	} + +	nla_nest_end(skb, start); +	return err; +} + +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ +	const struct nlattr *ovs_key = nla_data(a); +	int key_type = nla_type(ovs_key); +	struct nlattr *start; +	int err; + +	switch (key_type) { +	case OVS_KEY_ATTR_IPV4_TUNNEL: +		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); +		if (!start) +			return -EMSGSIZE; + +		err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), +					     nla_data(ovs_key)); +		if (err) +			return err; +		nla_nest_end(skb, start); +		break; +	default: +		if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) +			return -EMSGSIZE; +		break; +	} + +	return 0; +} + +int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) +{ +	const struct nlattr *a; +	int rem, err; + +	nla_for_each_attr(a, attr, len, rem) { +		int type = nla_type(a); + +		switch (type) { +		case OVS_ACTION_ATTR_SET: +			err = set_action_to_attr(a, skb); +			if (err) +				return err; +			break; + +		case OVS_ACTION_ATTR_SAMPLE: +			err = sample_action_to_attr(a, skb); +			if (err) +				return err; +			break; +		default: +			if 
(nla_put(skb, type, nla_len(a), nla_data(a))) +				return -EMSGSIZE; +			break; +		} +	} + +	return 0; +} diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h new file mode 100644 index 00000000000..440151045d3 --- /dev/null +++ b/net/openvswitch/flow_netlink.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + + +#ifndef FLOW_NETLINK_H +#define FLOW_NETLINK_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +void ovs_match_init(struct sw_flow_match *match, +		    struct sw_flow_key *key, struct sw_flow_mask *mask); + +int ovs_nla_put_flow(const struct sw_flow_key *, +		     const struct sw_flow_key *, struct sk_buff *); +int ovs_nla_get_flow_metadata(struct sw_flow *flow, +			      const struct nlattr *attr); +int ovs_nla_get_match(struct sw_flow_match *match, +		      const struct nlattr *, +		      const struct nlattr *); + +int ovs_nla_copy_actions(const struct nlattr *attr, +			 const struct sw_flow_key *key, int depth, +			 struct sw_flow_actions **sfa); +int ovs_nla_put_actions(const struct nlattr *attr, +			int len, struct sk_buff *skb); + +struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len); +void ovs_nla_free_flow_actions(struct sw_flow_actions *); + +#endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c new file mode 100644 index 00000000000..cf2d853646f --- /dev/null +++ b/net/openvswitch/flow_table.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +#define TBL_MIN_BUCKETS		1024 +#define REHASH_INTERVAL		(10 * 60 * HZ) + +static struct kmem_cache *flow_cache; +struct kmem_cache *flow_stats_cache __read_mostly; + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ +	return range->end - range->start; +} + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, +		       const struct sw_flow_mask *mask) +{ +	const long *m = (const long *)((const u8 *)&mask->key + +				mask->range.start); +	const long *s = (const long *)((const u8 *)src + +				mask->range.start); +	long *d = (long *)((u8 *)dst + mask->range.start); +	int i; + +	/* The memory outside of the 'mask->range' are not set since +	 * further operations on 'dst' only uses contents within +	 * 'mask->range'. +	 */ +	for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) +		*d++ = *s++ & *m++; +} + +struct sw_flow *ovs_flow_alloc(void) +{ +	struct sw_flow *flow; +	struct flow_stats *stats; +	int node; + +	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); +	if (!flow) +		return ERR_PTR(-ENOMEM); + +	flow->sf_acts = NULL; +	flow->mask = NULL; +	flow->stats_last_writer = NUMA_NO_NODE; + +	/* Initialize the default stat node. 
*/ +	stats = kmem_cache_alloc_node(flow_stats_cache, +				      GFP_KERNEL | __GFP_ZERO, 0); +	if (!stats) +		goto err; + +	spin_lock_init(&stats->lock); + +	RCU_INIT_POINTER(flow->stats[0], stats); + +	for_each_node(node) +		if (node != 0) +			RCU_INIT_POINTER(flow->stats[node], NULL); + +	return flow; +err: +	kmem_cache_free(flow_cache, flow); +	return ERR_PTR(-ENOMEM); +} + +int ovs_flow_tbl_count(struct flow_table *table) +{ +	return table->count; +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ +	struct flex_array *buckets; +	int i, err; + +	buckets = flex_array_alloc(sizeof(struct hlist_head), +				   n_buckets, GFP_KERNEL); +	if (!buckets) +		return NULL; + +	err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); +	if (err) { +		flex_array_free(buckets); +		return NULL; +	} + +	for (i = 0; i < n_buckets; i++) +		INIT_HLIST_HEAD((struct hlist_head *) +					flex_array_get(buckets, i)); + +	return buckets; +} + +static void flow_free(struct sw_flow *flow) +{ +	int node; + +	kfree((struct sw_flow_actions __force *)flow->sf_acts); +	for_each_node(node) +		if (flow->stats[node]) +			kmem_cache_free(flow_stats_cache, +					(struct flow_stats __force *)flow->stats[node]); +	kmem_cache_free(flow_cache, flow); +} + +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ +	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + +	flow_free(flow); +} + +void ovs_flow_free(struct sw_flow *flow, bool deferred) +{ +	if (!flow) +		return; + +	if (deferred) +		call_rcu(&flow->rcu, rcu_free_flow_callback); +	else +		flow_free(flow); +} + +static void free_buckets(struct flex_array *buckets) +{ +	flex_array_free(buckets); +} + + +static void __table_instance_destroy(struct table_instance *ti) +{ +	free_buckets(ti->buckets); +	kfree(ti); +} + +static struct table_instance *table_instance_alloc(int new_size) +{ +	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + +	if (!ti) +		return NULL; + +	ti->buckets = alloc_buckets(new_size); + +	if (!ti->buckets) { +		kfree(ti); +		return NULL; +	} +	ti->n_buckets = new_size; +	ti->node_ver = 0; +	ti->keep_flows = false; +	get_random_bytes(&ti->hash_seed, sizeof(u32)); + +	return ti; +} + +int ovs_flow_tbl_init(struct flow_table *table) +{ +	struct table_instance *ti; + +	ti = table_instance_alloc(TBL_MIN_BUCKETS); + +	if (!ti) +		return -ENOMEM; + +	rcu_assign_pointer(table->ti, ti); +	INIT_LIST_HEAD(&table->mask_list); +	table->last_rehash = jiffies; +	table->count = 0; +	return 0; +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ +	struct table_instance *ti = container_of(rcu, struct table_instance, rcu); + +	__table_instance_destroy(ti); +} + +static void table_instance_destroy(struct table_instance *ti, bool deferred) +{ +	int i; + +	if (!ti) +		return; + +	if (ti->keep_flows) +		goto skip_flows; + +	for (i = 0; i < ti->n_buckets; i++) { +		struct sw_flow *flow; +		struct hlist_head *head = flex_array_get(ti->buckets, i); +		struct hlist_node *n; +		int ver = ti->node_ver; + +		hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { +			hlist_del_rcu(&flow->hash_node[ver]); +			ovs_flow_free(flow, deferred); +		} +	} + +skip_flows: +	if (deferred) +		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); +	else +		__table_instance_destroy(ti); +} + +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +{ +	struct table_instance *ti = ovsl_dereference(table->ti); + +	table_instance_destroy(ti, deferred); +} + +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, +			
	       u32 *bucket, u32 *last) +{ +	struct sw_flow *flow; +	struct hlist_head *head; +	int ver; +	int i; + +	ver = ti->node_ver; +	while (*bucket < ti->n_buckets) { +		i = 0; +		head = flex_array_get(ti->buckets, *bucket); +		hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { +			if (i < *last) { +				i++; +				continue; +			} +			*last = i + 1; +			return flow; +		} +		(*bucket)++; +		*last = 0; +	} + +	return NULL; +} + +static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) +{ +	hash = jhash_1word(hash, ti->hash_seed); +	return flex_array_get(ti->buckets, +				(hash & (ti->n_buckets - 1))); +} + +static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow) +{ +	struct hlist_head *head; + +	head = find_bucket(ti, flow->hash); +	hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head); +} + +static void flow_table_copy_flows(struct table_instance *old, +				  struct table_instance *new) +{ +	int old_ver; +	int i; + +	old_ver = old->node_ver; +	new->node_ver = !old_ver; + +	/* Insert in new table. */ +	for (i = 0; i < old->n_buckets; i++) { +		struct sw_flow *flow; +		struct hlist_head *head; + +		head = flex_array_get(old->buckets, i); + +		hlist_for_each_entry(flow, head, hash_node[old_ver]) +			table_instance_insert(new, flow); +	} + +	old->keep_flows = true; +} + +static struct table_instance *table_instance_rehash(struct table_instance *ti, +					    int n_buckets) +{ +	struct table_instance *new_ti; + +	new_ti = table_instance_alloc(n_buckets); +	if (!new_ti) +		return NULL; + +	flow_table_copy_flows(ti, new_ti); + +	return new_ti; +} + +int ovs_flow_tbl_flush(struct flow_table *flow_table) +{ +	struct table_instance *old_ti; +	struct table_instance *new_ti; + +	old_ti = ovsl_dereference(flow_table->ti); +	new_ti = table_instance_alloc(TBL_MIN_BUCKETS); +	if (!new_ti) +		return -ENOMEM; + +	rcu_assign_pointer(flow_table->ti, new_ti); +	flow_table->last_rehash = jiffies; +	flow_table->count = 0; + +	table_instance_destroy(old_ti, true); +	return 0; +} + +static u32 flow_hash(const struct sw_flow_key *key, int key_start, +		     int key_end) +{ +	const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); +	int hash_u32s = (key_end - key_start) >> 2; + +	/* Make sure number of hash bytes are multiple of u32. 
*/ +	BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + +	return arch_fast_hash2(hash_key, hash_u32s, 0); +} + +static int flow_key_start(const struct sw_flow_key *key) +{ +	if (key->tun_key.ipv4_dst) +		return 0; +	else +		return rounddown(offsetof(struct sw_flow_key, phy), +					  sizeof(long)); +} + +static bool cmp_key(const struct sw_flow_key *key1, +		    const struct sw_flow_key *key2, +		    int key_start, int key_end) +{ +	const long *cp1 = (const long *)((const u8 *)key1 + key_start); +	const long *cp2 = (const long *)((const u8 *)key2 + key_start); +	long diffs = 0; +	int i; + +	for (i = key_start; i < key_end;  i += sizeof(long)) +		diffs |= *cp1++ ^ *cp2++; + +	return diffs == 0; +} + +static bool flow_cmp_masked_key(const struct sw_flow *flow, +				const struct sw_flow_key *key, +				int key_start, int key_end) +{ +	return cmp_key(&flow->key, key, key_start, key_end); +} + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, +			       struct sw_flow_match *match) +{ +	struct sw_flow_key *key = match->key; +	int key_start = flow_key_start(key); +	int key_end = match->range.end; + +	return cmp_key(&flow->unmasked_key, key, key_start, key_end); +} + +static struct sw_flow *masked_flow_lookup(struct table_instance *ti, +					  const struct sw_flow_key *unmasked, +					  struct sw_flow_mask *mask) +{ +	struct sw_flow *flow; +	struct hlist_head *head; +	int key_start = mask->range.start; +	int key_end = mask->range.end; +	u32 hash; +	struct sw_flow_key masked_key; + +	ovs_flow_mask_key(&masked_key, unmasked, mask); +	hash = flow_hash(&masked_key, key_start, key_end); +	head = find_bucket(ti, hash); +	hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) { +		if (flow->mask == mask && flow->hash == hash && +		    flow_cmp_masked_key(flow, &masked_key, +					  key_start, key_end)) +			return flow; +	} +	return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, +				    const struct sw_flow_key *key, +				    u32 *n_mask_hit) +{ +	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); +	struct sw_flow_mask *mask; +	struct sw_flow *flow; + +	*n_mask_hit = 0; +	list_for_each_entry_rcu(mask, &tbl->mask_list, list) { +		(*n_mask_hit)++; +		flow = masked_flow_lookup(ti, key, mask); +		if (flow)  /* Found */ +			return flow; +	} +	return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, +				    const struct sw_flow_key *key) +{ +	u32 __always_unused n_mask_hit; + +	return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); +} + +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, +					  struct sw_flow_match *match) +{ +	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); +	struct sw_flow_mask *mask; +	struct sw_flow *flow; + +	/* Always called under ovs-mutex. */ +	list_for_each_entry(mask, &tbl->mask_list, list) { +		flow = masked_flow_lookup(ti, match->key, mask); +		if (flow && ovs_flow_cmp_unmasked_key(flow, match))  /* Found */ +			return flow; +	} +	return NULL; +} + +int ovs_flow_tbl_num_masks(const struct flow_table *table) +{ +	struct sw_flow_mask *mask; +	int num = 0; + +	list_for_each_entry(mask, &table->mask_list, list) +		num++; + +	return num; +} + +static struct table_instance *table_instance_expand(struct table_instance *ti) +{ +	return table_instance_rehash(ti, ti->n_buckets * 2); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. 
*/ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ +	if (mask) { +		/* ovs-lock is required to protect mask-refcount and +		 * mask list. +		 */ +		ASSERT_OVSL(); +		BUG_ON(!mask->ref_count); +		mask->ref_count--; + +		if (!mask->ref_count) { +			list_del_rcu(&mask->list); +			kfree_rcu(mask, rcu); +		} +	} +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ +	struct table_instance *ti = ovsl_dereference(table->ti); + +	BUG_ON(table->count == 0); +	hlist_del_rcu(&flow->hash_node[ti->node_ver]); +	table->count--; + +	/* RCU delete the mask. 'flow->mask' is not NULLed, as it should be +	 * accessible as long as the RCU read lock is held. +	 */ +	flow_mask_remove(table, flow->mask); +} + +static struct sw_flow_mask *mask_alloc(void) +{ +	struct sw_flow_mask *mask; + +	mask = kmalloc(sizeof(*mask), GFP_KERNEL); +	if (mask) +		mask->ref_count = 1; + +	return mask; +} + +static bool mask_equal(const struct sw_flow_mask *a, +		       const struct sw_flow_mask *b) +{ +	const u8 *a_ = (const u8 *)&a->key + a->range.start; +	const u8 *b_ = (const u8 *)&b->key + b->range.start; + +	return  (a->range.end == b->range.end) +		&& (a->range.start == b->range.start) +		&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, +					   const struct sw_flow_mask *mask) +{ +	struct list_head *ml; + +	list_for_each(ml, &tbl->mask_list) { +		struct sw_flow_mask *m; +		m = container_of(ml, struct sw_flow_mask, list); +		if (mask_equal(mask, m)) +			return m; +	} + +	return NULL; +} + +/* Add 'mask' into the mask list, if it is not already there. */ +static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, +			    struct sw_flow_mask *new) +{ +	struct sw_flow_mask *mask; +	mask = flow_mask_find(tbl, new); +	if (!mask) { +		/* Allocate a new mask if none exsits. */ +		mask = mask_alloc(); +		if (!mask) +			return -ENOMEM; +		mask->key = new->key; +		mask->range = new->range; +		list_add_rcu(&mask->list, &tbl->mask_list); +	} else { +		BUG_ON(!mask->ref_count); +		mask->ref_count++; +	} + +	flow->mask = mask; +	return 0; +} + +/* Must be called with OVS mutex held. */ +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, +			struct sw_flow_mask *mask) +{ +	struct table_instance *new_ti = NULL; +	struct table_instance *ti; +	int err; + +	err = flow_mask_insert(table, flow, mask); +	if (err) +		return err; + +	flow->hash = flow_hash(&flow->key, flow->mask->range.start, +			flow->mask->range.end); +	ti = ovsl_dereference(table->ti); +	table_instance_insert(ti, flow); +	table->count++; + +	/* Expand table, if necessary, to make room. */ +	if (table->count > ti->n_buckets) +		new_ti = table_instance_expand(ti); +	else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) +		new_ti = table_instance_rehash(ti, ti->n_buckets); + +	if (new_ti) { +		rcu_assign_pointer(table->ti, new_ti); +		table_instance_destroy(ti, true); +		table->last_rehash = jiffies; +	} +	return 0; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. 
*/ +int ovs_flow_init(void) +{ +	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); +	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + +	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) +				       + (num_possible_nodes() +					  * sizeof(struct flow_stats *)), +				       0, 0, NULL); +	if (flow_cache == NULL) +		return -ENOMEM; + +	flow_stats_cache +		= kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats), +				    0, SLAB_HWCACHE_ALIGN, NULL); +	if (flow_stats_cache == NULL) { +		kmem_cache_destroy(flow_cache); +		flow_cache = NULL; +		return -ENOMEM; +	} + +	return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ +	kmem_cache_destroy(flow_stats_cache); +	kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h new file mode 100644 index 00000000000..5918bff7f3f --- /dev/null +++ b/net/openvswitch/flow_table.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +struct table_instance { +	struct flex_array *buckets; +	unsigned int n_buckets; +	struct rcu_head rcu; +	int node_ver; +	u32 hash_seed; +	bool keep_flows; +}; + +struct flow_table { +	struct table_instance __rcu *ti; +	struct list_head mask_list; +	unsigned long last_rehash; +	unsigned int count; +}; + +extern struct kmem_cache *flow_stats_cache; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_free(struct sw_flow *, bool deferred); + +int ovs_flow_tbl_init(struct flow_table *); +int ovs_flow_tbl_count(struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +int ovs_flow_tbl_flush(struct flow_table *flow_table); + +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, +			struct sw_flow_mask *mask); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +int  ovs_flow_tbl_num_masks(const struct flow_table *table); +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, +				       u32 *bucket, u32 *idx); +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, +				    const struct sw_flow_key *, +				    u32 *n_mask_hit); +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, +				    const struct sw_flow_key *); +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, +					  struct sw_flow_match *match); +bool 
ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, +			       struct sw_flow_match *match); + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, +		       const struct sw_flow_mask *mask); +#endif /* flow_table.h */ diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c new file mode 100644 index 00000000000..f49148a07da --- /dev/null +++ b/net/openvswitch/vport-gre.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/if_tunnel.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/in_route.h> +#include <linux/inetdevice.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/rculist.h> +#include <net/route.h> +#include <net/xfrm.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/gre.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/protocol.h> + +#include "datapath.h" +#include "vport.h" + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 be64_get_low32(__be64 x) +{ +#ifdef __BIG_ENDIAN +	return (__force __be32)x; +#else +	return (__force __be32)((__force u64)x >> 32); +#endif +} + +static __be16 filter_tnl_flags(__be16 flags) +{ +	return flags & (TUNNEL_CSUM | TUNNEL_KEY); +} + +static struct sk_buff *__build_header(struct sk_buff *skb, +				      int tunnel_hlen) +{ +	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; +	struct tnl_ptk_info tpi; + +	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); +	if (IS_ERR(skb)) +		return NULL; + +	tpi.flags = filter_tnl_flags(tun_key->tun_flags); +	tpi.proto = htons(ETH_P_TEB); +	tpi.key = be64_get_low32(tun_key->tun_id); +	tpi.seq = 0; +	gre_build_header(skb, &tpi, tunnel_hlen); + +	return skb; +} + +static __be64 key_to_tunnel_id(__be32 key, __be32 seq) +{ +#ifdef __BIG_ENDIAN +	return (__force __be64)((__force u64)seq << 32 | (__force u32)key); +#else +	return (__force __be64)((__force u64)key << 32 | (__force u32)seq); +#endif +} + +/* Called with rcu_read_lock and BH disabled. 
*/ +static int gre_rcv(struct sk_buff *skb, +		   const struct tnl_ptk_info *tpi) +{ +	struct ovs_key_ipv4_tunnel tun_key; +	struct ovs_net *ovs_net; +	struct vport *vport; +	__be64 key; + +	ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); +	vport = rcu_dereference(ovs_net->vport_net.gre_vport); +	if (unlikely(!vport)) +		return PACKET_REJECT; + +	key = key_to_tunnel_id(tpi->key, tpi->seq); +	ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, +			      filter_tnl_flags(tpi->flags)); + +	ovs_vport_receive(vport, skb, &tun_key); +	return PACKET_RCVD; +} + +/* Called with rcu_read_lock and BH disabled. */ +static int gre_err(struct sk_buff *skb, u32 info, +		   const struct tnl_ptk_info *tpi) +{ +	struct ovs_net *ovs_net; +	struct vport *vport; + +	ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); +	vport = rcu_dereference(ovs_net->vport_net.gre_vport); + +	if (unlikely(!vport)) +		return PACKET_REJECT; +	else +		return PACKET_RCVD; +} + +static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) +{ +	struct net *net = ovs_dp_get_net(vport->dp); +	struct flowi4 fl; +	struct rtable *rt; +	int min_headroom; +	int tunnel_hlen; +	__be16 df; +	int err; + +	if (unlikely(!OVS_CB(skb)->tun_key)) { +		err = -EINVAL; +		goto error; +	} + +	/* Route lookup */ +	memset(&fl, 0, sizeof(fl)); +	fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; +	fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; +	fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); +	fl.flowi4_mark = skb->mark; +	fl.flowi4_proto = IPPROTO_GRE; + +	rt = ip_route_output_key(net, &fl); +	if (IS_ERR(rt)) +		return PTR_ERR(rt); + +	tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); + +	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len +			+ tunnel_hlen + sizeof(struct iphdr) +			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); +	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { +		int head_delta = SKB_DATA_ALIGN(min_headroom - +						skb_headroom(skb) + +						16); +		err = pskb_expand_head(skb, max_t(int, head_delta, 0), +					0, GFP_ATOMIC); +		if (unlikely(err)) +			goto err_free_rt; +	} + +	if (vlan_tx_tag_present(skb)) { +		if (unlikely(!__vlan_put_tag(skb, +					     skb->vlan_proto, +					     vlan_tx_tag_get(skb)))) { +			err = -ENOMEM; +			goto err_free_rt; +		} +		skb->vlan_tci = 0; +	} + +	/* Push Tunnel header. */ +	skb = __build_header(skb, tunnel_hlen); +	if (unlikely(!skb)) { +		err = 0; +		goto err_free_rt; +	} + +	df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
+		htons(IP_DF) : 0; + +	skb->ignore_df = 1; + +	return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, +			     OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, +			     OVS_CB(skb)->tun_key->ipv4_tos, +			     OVS_CB(skb)->tun_key->ipv4_ttl, df, false); +err_free_rt: +	ip_rt_put(rt); +error: +	return err; +} + +static struct gre_cisco_protocol gre_protocol = { +	.handler        = gre_rcv, +	.err_handler    = gre_err, +	.priority       = 1, +}; + +static int gre_ports; +static int gre_init(void) +{ +	int err; + +	gre_ports++; +	if (gre_ports > 1) +		return 0; + +	err = gre_cisco_register(&gre_protocol); +	if (err) +		pr_warn("cannot register gre protocol handler\n"); + +	return err; +} + +static void gre_exit(void) +{ +	gre_ports--; +	if (gre_ports > 0) +		return; + +	gre_cisco_unregister(&gre_protocol); +} + +static const char *gre_get_name(const struct vport *vport) +{ +	return vport_priv(vport); +} + +static struct vport *gre_create(const struct vport_parms *parms) +{ +	struct net *net = ovs_dp_get_net(parms->dp); +	struct ovs_net *ovs_net; +	struct vport *vport; +	int err; + +	err = gre_init(); +	if (err) +		return ERR_PTR(err); + +	ovs_net = net_generic(net, ovs_net_id); +	if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { +		vport = ERR_PTR(-EEXIST); +		goto error; +	} + +	vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); +	if (IS_ERR(vport)) +		goto error; + +	strncpy(vport_priv(vport), parms->name, IFNAMSIZ); +	rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); +	return vport; + +error: +	gre_exit(); +	return vport; +} + +static void gre_tnl_destroy(struct vport *vport) +{ +	struct net *net = ovs_dp_get_net(vport->dp); +	struct ovs_net *ovs_net; + +	ovs_net = net_generic(net, ovs_net_id); + +	RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); +	ovs_vport_deferred_free(vport); +	gre_exit(); +} + +const struct vport_ops ovs_gre_vport_ops = { +	.type		= OVS_VPORT_TYPE_GRE, +	.create		= gre_create, +	.destroy	= gre_tnl_destroy, +	.get_name	= gre_get_name, +	.send		= gre_tnl_send, +}; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c new file mode 100644 index 00000000000..789af9280e7 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/hardirq.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/skbuff.h> + +#include <net/dst.h> +#include <net/xfrm.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +struct internal_dev { +	struct vport *vport; +}; + +static struct internal_dev *internal_dev_priv(struct net_device *netdev) +{ +	return netdev_priv(netdev); +} + +/* This function is only called by the kernel network layer.*/ +static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, +							struct rtnl_link_stats64 *stats) +{ +	struct vport *vport = ovs_internal_dev_get_vport(netdev); +	struct ovs_vport_stats vport_stats; + +	ovs_vport_get_stats(vport, &vport_stats); + +	/* The tx and rx stats need to be swapped because the +	 * switch and host OS have opposite perspectives. */ +	stats->rx_packets	= vport_stats.tx_packets; +	stats->tx_packets	= vport_stats.rx_packets; +	stats->rx_bytes		= vport_stats.tx_bytes; +	stats->tx_bytes		= vport_stats.rx_bytes; +	stats->rx_errors	= vport_stats.tx_errors; +	stats->tx_errors	= vport_stats.rx_errors; +	stats->rx_dropped	= vport_stats.tx_dropped; +	stats->tx_dropped	= vport_stats.rx_dropped; + +	return stats; +} + +/* Called with rcu_read_lock_bh. */ +static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +{ +	rcu_read_lock(); +	ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); +	rcu_read_unlock(); +	return 0; +} + +static int internal_dev_open(struct net_device *netdev) +{ +	netif_start_queue(netdev); +	return 0; +} + +static int internal_dev_stop(struct net_device *netdev) +{ +	netif_stop_queue(netdev); +	return 0; +} + +static void internal_dev_getinfo(struct net_device *netdev, +				 struct ethtool_drvinfo *info) +{ +	strlcpy(info->driver, "openvswitch", sizeof(info->driver)); +} + +static const struct ethtool_ops internal_dev_ethtool_ops = { +	.get_drvinfo	= internal_dev_getinfo, +	.get_link	= ethtool_op_get_link, +}; + +static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) +{ +	if (new_mtu < 68) +		return -EINVAL; + +	netdev->mtu = new_mtu; +	return 0; +} + +static void internal_dev_destructor(struct net_device *dev) +{ +	struct vport *vport = ovs_internal_dev_get_vport(dev); + +	ovs_vport_free(vport); +	free_netdev(dev); +} + +static const struct net_device_ops internal_dev_netdev_ops = { +	.ndo_open = internal_dev_open, +	.ndo_stop = internal_dev_stop, +	.ndo_start_xmit = internal_dev_xmit, +	.ndo_set_mac_address = eth_mac_addr, +	.ndo_change_mtu = internal_dev_change_mtu, +	.ndo_get_stats64 = internal_dev_get_stats, +}; + +static void do_setup(struct net_device *netdev) +{ +	ether_setup(netdev); + +	netdev->netdev_ops = &internal_dev_netdev_ops; + +	netdev->priv_flags &= ~IFF_TX_SKB_SHARING; +	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; +	netdev->destructor = internal_dev_destructor; +	netdev->ethtool_ops = &internal_dev_ethtool_ops; +	netdev->tx_queue_len = 0; + +	netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | +			   NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + +	netdev->vlan_features = netdev->features; +	netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; +	netdev->hw_features 
= netdev->features & ~NETIF_F_LLTX; +	eth_hw_addr_random(netdev); +} + +static struct vport *internal_dev_create(const struct vport_parms *parms) +{ +	struct vport *vport; +	struct netdev_vport *netdev_vport; +	struct internal_dev *internal_dev; +	int err; + +	vport = ovs_vport_alloc(sizeof(struct netdev_vport), +				&ovs_internal_vport_ops, parms); +	if (IS_ERR(vport)) { +		err = PTR_ERR(vport); +		goto error; +	} + +	netdev_vport = netdev_vport_priv(vport); + +	netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), +					 parms->name, do_setup); +	if (!netdev_vport->dev) { +		err = -ENOMEM; +		goto error_free_vport; +	} + +	dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); +	internal_dev = internal_dev_priv(netdev_vport->dev); +	internal_dev->vport = vport; + +	/* Restrict bridge port to current netns. */ +	if (vport->port_no == OVSP_LOCAL) +		netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + +	rtnl_lock(); +	err = register_netdevice(netdev_vport->dev); +	if (err) +		goto error_free_netdev; + +	dev_set_promiscuity(netdev_vport->dev, 1); +	rtnl_unlock(); +	netif_start_queue(netdev_vport->dev); + +	return vport; + +error_free_netdev: +	rtnl_unlock(); +	free_netdev(netdev_vport->dev); +error_free_vport: +	ovs_vport_free(vport); +error: +	return ERR_PTR(err); +} + +static void internal_dev_destroy(struct vport *vport) +{ +	struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + +	netif_stop_queue(netdev_vport->dev); +	rtnl_lock(); +	dev_set_promiscuity(netdev_vport->dev, -1); + +	/* unregister_netdevice() waits for an RCU grace period. */ +	unregister_netdevice(netdev_vport->dev); + +	rtnl_unlock(); +} + +static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +{ +	struct net_device *netdev = netdev_vport_priv(vport)->dev; +	int len; + +	len = skb->len; + +	skb_dst_drop(skb); +	nf_reset(skb); +	secpath_reset(skb); + +	skb->dev = netdev; +	skb->pkt_type = PACKET_HOST; +	skb->protocol = eth_type_trans(skb, netdev); +	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + +	netif_rx(skb); + +	return len; +} + +const struct vport_ops ovs_internal_vport_ops = { +	.type		= OVS_VPORT_TYPE_INTERNAL, +	.create		= internal_dev_create, +	.destroy	= internal_dev_destroy, +	.get_name	= ovs_netdev_get_name, +	.send		= internal_dev_recv, +}; + +int ovs_is_internal_dev(const struct net_device *netdev) +{ +	return netdev->netdev_ops == &internal_dev_netdev_ops; +} + +struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) +{ +	if (!ovs_is_internal_dev(netdev)) +		return NULL; + +	return internal_dev_priv(netdev)->vport; +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h new file mode 100644 index 00000000000..9a7d30ecc6a --- /dev/null +++ b/net/openvswitch/vport-internal_dev.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_INTERNAL_DEV_H +#define VPORT_INTERNAL_DEV_H 1 + +#include "datapath.h" +#include "vport.h" + +int ovs_is_internal_dev(const struct net_device *); +struct vport *ovs_internal_dev_get_vport(struct net_device *); + +#endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c new file mode 100644 index 00000000000..d21f77d875b --- /dev/null +++ b/net/openvswitch/vport-netdev.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if_arp.h> +#include <linux/if_bridge.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/llc.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/openvswitch.h> + +#include <net/llc.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +/* Must be called with rcu_read_lock. */ +static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +{ +	if (unlikely(!vport)) +		goto error; + +	if (unlikely(skb_warn_if_lro(skb))) +		goto error; + +	/* Make our own copy of the packet.  Otherwise we will mangle the +	 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). +	 */ +	skb = skb_share_check(skb, GFP_ATOMIC); +	if (unlikely(!skb)) +		return; + +	skb_push(skb, ETH_HLEN); +	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + +	ovs_vport_receive(vport, skb, NULL); +	return; + +error: +	kfree_skb(skb); +} + +/* Called with rcu_read_lock and bottom-halves disabled. 
*/ +static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) +{ +	struct sk_buff *skb = *pskb; +	struct vport *vport; + +	if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) +		return RX_HANDLER_PASS; + +	vport = ovs_netdev_get_vport(skb->dev); + +	netdev_port_receive(vport, skb); + +	return RX_HANDLER_CONSUMED; +} + +static struct net_device *get_dpdev(struct datapath *dp) +{ +	struct vport *local; + +	local = ovs_vport_ovsl(dp, OVSP_LOCAL); +	BUG_ON(!local); +	return netdev_vport_priv(local)->dev; +} + +static struct vport *netdev_create(const struct vport_parms *parms) +{ +	struct vport *vport; +	struct netdev_vport *netdev_vport; +	int err; + +	vport = ovs_vport_alloc(sizeof(struct netdev_vport), +				&ovs_netdev_vport_ops, parms); +	if (IS_ERR(vport)) { +		err = PTR_ERR(vport); +		goto error; +	} + +	netdev_vport = netdev_vport_priv(vport); + +	netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); +	if (!netdev_vport->dev) { +		err = -ENODEV; +		goto error_free_vport; +	} + +	if (netdev_vport->dev->flags & IFF_LOOPBACK || +	    netdev_vport->dev->type != ARPHRD_ETHER || +	    ovs_is_internal_dev(netdev_vport->dev)) { +		err = -EINVAL; +		goto error_put; +	} + +	rtnl_lock(); +	err = netdev_master_upper_dev_link(netdev_vport->dev, +					   get_dpdev(vport->dp)); +	if (err) +		goto error_unlock; + +	err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, +					 vport); +	if (err) +		goto error_master_upper_dev_unlink; + +	dev_set_promiscuity(netdev_vport->dev, 1); +	netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; +	rtnl_unlock(); + +	return vport; + +error_master_upper_dev_unlink: +	netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); +error_unlock: +	rtnl_unlock(); +error_put: +	dev_put(netdev_vport->dev); +error_free_vport: +	ovs_vport_free(vport); +error: +	return ERR_PTR(err); +} + +static void free_port_rcu(struct rcu_head *rcu) +{ +	struct netdev_vport *netdev_vport = container_of(rcu, +					struct netdev_vport, rcu); + +	dev_put(netdev_vport->dev); +	ovs_vport_free(vport_from_priv(netdev_vport)); +} + +void ovs_netdev_detach_dev(struct vport *vport) +{ +	struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + +	ASSERT_RTNL(); +	netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; +	netdev_rx_handler_unregister(netdev_vport->dev); +	netdev_upper_dev_unlink(netdev_vport->dev, +				netdev_master_upper_dev_get(netdev_vport->dev)); +	dev_set_promiscuity(netdev_vport->dev, -1); +} + +static void netdev_destroy(struct vport *vport) +{ +	struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + +	rtnl_lock(); +	if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) +		ovs_netdev_detach_dev(vport); +	rtnl_unlock(); + +	call_rcu(&netdev_vport->rcu, free_port_rcu); +} + +const char *ovs_netdev_get_name(const struct vport *vport) +{ +	const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); +	return netdev_vport->dev->name; +} + +static unsigned int packet_length(const struct sk_buff *skb) +{ +	unsigned int length = skb->len - ETH_HLEN; + +	if (skb->protocol == htons(ETH_P_8021Q)) +		length -= VLAN_HLEN; + +	return length; +} + +static int netdev_send(struct vport *vport, struct sk_buff *skb) +{ +	struct netdev_vport *netdev_vport = netdev_vport_priv(vport); +	int mtu = netdev_vport->dev->mtu; +	int len; + +	if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { +		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", +				     netdev_vport->dev->name, +				     packet_length(skb), mtu); +		
goto drop; +	} + +	skb->dev = netdev_vport->dev; +	len = skb->len; +	dev_queue_xmit(skb); + +	return len; + +drop: +	kfree_skb(skb); +	return 0; +} + +/* Returns null if this device is not attached to a datapath. */ +struct vport *ovs_netdev_get_vport(struct net_device *dev) +{ +	if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) +		return (struct vport *) +			rcu_dereference_rtnl(dev->rx_handler_data); +	else +		return NULL; +} + +const struct vport_ops ovs_netdev_vport_ops = { +	.type		= OVS_VPORT_TYPE_NETDEV, +	.create		= netdev_create, +	.destroy	= netdev_destroy, +	.get_name	= ovs_netdev_get_name, +	.send		= netdev_send, +}; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h new file mode 100644 index 00000000000..8df01c1127e --- /dev/null +++ b/net/openvswitch/vport-netdev.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_NETDEV_H +#define VPORT_NETDEV_H 1 + +#include <linux/netdevice.h> +#include <linux/rcupdate.h> + +#include "vport.h" + +struct vport *ovs_netdev_get_vport(struct net_device *dev); + +struct netdev_vport { +	struct rcu_head rcu; + +	struct net_device *dev; +}; + +static inline struct netdev_vport * +netdev_vport_priv(const struct vport *vport) +{ +	return vport_priv(vport); +} + +const char *ovs_netdev_get_name(const struct vport *); +void ovs_netdev_detach_dev(struct vport *); + +#endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 00000000000..0edbd95c60e --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/rculist.h> +#include <linux/udp.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> +#include <net/route.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/vxlan.h> + +#include "datapath.h" +#include "vport.h" + +/** + * struct vxlan_port - Keeps track of open UDP ports + * @vs: vxlan_sock created for the port. + * @name: vport name. + */ +struct vxlan_port { +	struct vxlan_sock *vs; +	char name[IFNAMSIZ]; +}; + +static inline struct vxlan_port *vxlan_vport(const struct vport *vport) +{ +	return vport_priv(vport); +} + +/* Called with rcu_read_lock and BH disabled. */ +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) +{ +	struct ovs_key_ipv4_tunnel tun_key; +	struct vport *vport = vs->data; +	struct iphdr *iph; +	__be64 key; + +	/* Save outer tunnel values */ +	iph = ip_hdr(skb); +	key = cpu_to_be64(ntohl(vx_vni) >> 8); +	ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + +	ovs_vport_receive(vport, skb, &tun_key); +} + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ +	struct vxlan_port *vxlan_port = vxlan_vport(vport); +	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + +	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) +		return -EMSGSIZE; +	return 0; +} + +static void vxlan_tnl_destroy(struct vport *vport) +{ +	struct vxlan_port *vxlan_port = vxlan_vport(vport); + +	vxlan_sock_release(vxlan_port->vs); + +	ovs_vport_deferred_free(vport); +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ +	struct net *net = ovs_dp_get_net(parms->dp); +	struct nlattr *options = parms->options; +	struct vxlan_port *vxlan_port; +	struct vxlan_sock *vs; +	struct vport *vport; +	struct nlattr *a; +	u16 dst_port; +	int err; + +	if (!options) { +		err = -EINVAL; +		goto error; +	} +	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); +	if (a && nla_len(a) == sizeof(u16)) { +		dst_port = nla_get_u16(a); +	} else { +		/* Require destination port from userspace. 
*/ +		err = -EINVAL; +		goto error; +	} + +	vport = ovs_vport_alloc(sizeof(struct vxlan_port), +				&ovs_vxlan_vport_ops, parms); +	if (IS_ERR(vport)) +		return vport; + +	vxlan_port = vxlan_vport(vport); +	strncpy(vxlan_port->name, parms->name, IFNAMSIZ); + +	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0); +	if (IS_ERR(vs)) { +		ovs_vport_free(vport); +		return (void *)vs; +	} +	vxlan_port->vs = vs; + +	return vport; + +error: +	return ERR_PTR(err); +} + +static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) +{ +	struct net *net = ovs_dp_get_net(vport->dp); +	struct vxlan_port *vxlan_port = vxlan_vport(vport); +	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; +	struct rtable *rt; +	struct flowi4 fl; +	__be16 src_port; +	int port_min; +	int port_max; +	__be16 df; +	int err; + +	if (unlikely(!OVS_CB(skb)->tun_key)) { +		err = -EINVAL; +		goto error; +	} + +	/* Route lookup */ +	memset(&fl, 0, sizeof(fl)); +	fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; +	fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; +	fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); +	fl.flowi4_mark = skb->mark; +	fl.flowi4_proto = IPPROTO_UDP; + +	rt = ip_route_output_key(net, &fl); +	if (IS_ERR(rt)) { +		err = PTR_ERR(rt); +		goto error; +	} + +	df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? +		htons(IP_DF) : 0; + +	skb->ignore_df = 1; + +	inet_get_local_port_range(net, &port_min, &port_max); +	src_port = vxlan_src_port(port_min, port_max, skb); + +	err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, +			     fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, +			     OVS_CB(skb)->tun_key->ipv4_tos, +			     OVS_CB(skb)->tun_key->ipv4_ttl, df, +			     src_port, dst_port, +			     htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8), +			     false); +	if (err < 0) +		ip_rt_put(rt); +error: +	return err; +} + +static const char *vxlan_get_name(const struct vport *vport) +{ +	struct vxlan_port *vxlan_port = vxlan_vport(vport); +	return vxlan_port->name; +} + +const struct vport_ops ovs_vxlan_vport_ops = { +	.type		= OVS_VPORT_TYPE_VXLAN, +	.create		= vxlan_tnl_create, +	.destroy	= vxlan_tnl_destroy, +	.get_name	= vxlan_get_name, +	.get_options	= vxlan_get_options, +	.send		= vxlan_tnl_send, +}; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c new file mode 100644 index 00000000000..42c0f4a0b78 --- /dev/null +++ b/net/openvswitch/vport.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/etherdevice.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <linux/jhash.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> +#include <linux/compat.h> +#include <net/net_namespace.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-internal_dev.h" + +static void ovs_vport_record_error(struct vport *, +				   enum vport_err_type err_type); + +/* List of statically compiled vport implementations.  Don't forget to also + * add yours to the list at the bottom of vport.h. */ +static const struct vport_ops *vport_ops_list[] = { +	&ovs_netdev_vport_ops, +	&ovs_internal_vport_ops, + +#ifdef CONFIG_OPENVSWITCH_GRE +	&ovs_gre_vport_ops, +#endif +#ifdef CONFIG_OPENVSWITCH_VXLAN +	&ovs_vxlan_vport_ops, +#endif +}; + +/* Protected by RCU read lock for reading, ovs_mutex for writing. */ +static struct hlist_head *dev_table; +#define VPORT_HASH_BUCKETS 1024 + +/** + *	ovs_vport_init - initialize vport subsystem + * + * Called at module load time to initialize the vport subsystem. + */ +int ovs_vport_init(void) +{ +	dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), +			    GFP_KERNEL); +	if (!dev_table) +		return -ENOMEM; + +	return 0; +} + +/** + *	ovs_vport_exit - shutdown vport subsystem + * + * Called at module exit time to shutdown the vport subsystem. + */ +void ovs_vport_exit(void) +{ +	kfree(dev_table); +} + +static struct hlist_head *hash_bucket(struct net *net, const char *name) +{ +	unsigned int hash = jhash(name, strlen(name), (unsigned long) net); +	return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; +} + +/** + *	ovs_vport_locate - find a port that has already been created + * + * @name: name of port to find + * + * Must be called with ovs or RCU read lock. + */ +struct vport *ovs_vport_locate(struct net *net, const char *name) +{ +	struct hlist_head *bucket = hash_bucket(net, name); +	struct vport *vport; + +	hlist_for_each_entry_rcu(vport, bucket, hash_node) +		if (!strcmp(name, vport->ops->get_name(vport)) && +		    net_eq(ovs_dp_get_net(vport->dp), net)) +			return vport; + +	return NULL; +} + +/** + *	ovs_vport_alloc - allocate and initialize new vport + * + * @priv_size: Size of private data area to allocate. + * @ops: vport device ops + * + * Allocate and initialize a new vport defined by @ops.  The vport will contain + * a private data area of size @priv_size that can be accessed using + * vport_priv().  vports that are no longer needed should be released with + * vport_free(). 
+ */ +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, +			  const struct vport_parms *parms) +{ +	struct vport *vport; +	size_t alloc_size; + +	alloc_size = sizeof(struct vport); +	if (priv_size) { +		alloc_size = ALIGN(alloc_size, VPORT_ALIGN); +		alloc_size += priv_size; +	} + +	vport = kzalloc(alloc_size, GFP_KERNEL); +	if (!vport) +		return ERR_PTR(-ENOMEM); + +	vport->dp = parms->dp; +	vport->port_no = parms->port_no; +	vport->upcall_portid = parms->upcall_portid; +	vport->ops = ops; +	INIT_HLIST_NODE(&vport->dp_hash_node); + +	vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); +	if (!vport->percpu_stats) { +		kfree(vport); +		return ERR_PTR(-ENOMEM); +	} + +	spin_lock_init(&vport->stats_lock); + +	return vport; +} + +/** + *	ovs_vport_free - uninitialize and free vport + * + * @vport: vport to free + * + * Frees a vport allocated with vport_alloc() when it is no longer needed. + * + * The caller must ensure that an RCU grace period has passed since the last + * time @vport was in a datapath. + */ +void ovs_vport_free(struct vport *vport) +{ +	free_percpu(vport->percpu_stats); +	kfree(vport); +} + +/** + *	ovs_vport_add - add vport device (for kernel callers) + * + * @parms: Information about new vport. + * + * Creates a new vport with the specified configuration (which is dependent on + * device type).  ovs_mutex must be held. + */ +struct vport *ovs_vport_add(const struct vport_parms *parms) +{ +	struct vport *vport; +	int err = 0; +	int i; + +	for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { +		if (vport_ops_list[i]->type == parms->type) { +			struct hlist_head *bucket; + +			vport = vport_ops_list[i]->create(parms); +			if (IS_ERR(vport)) { +				err = PTR_ERR(vport); +				goto out; +			} + +			bucket = hash_bucket(ovs_dp_get_net(vport->dp), +					     vport->ops->get_name(vport)); +			hlist_add_head_rcu(&vport->hash_node, bucket); +			return vport; +		} +	} + +	err = -EAFNOSUPPORT; + +out: +	return ERR_PTR(err); +} + +/** + *	ovs_vport_set_options - modify existing vport device (for kernel callers) + * + * @vport: vport to modify. + * @options: New configuration. + * + * Modifies an existing device with the specified configuration (which is + * dependent on device type).  ovs_mutex must be held. + */ +int ovs_vport_set_options(struct vport *vport, struct nlattr *options) +{ +	if (!vport->ops->set_options) +		return -EOPNOTSUPP; +	return vport->ops->set_options(vport, options); +} + +/** + *	ovs_vport_del - delete existing vport device + * + * @vport: vport to delete. + * + * Detaches @vport from its datapath and destroys it.  It is possible to fail + * for reasons such as lack of memory.  ovs_mutex must be held. + */ +void ovs_vport_del(struct vport *vport) +{ +	ASSERT_OVSL(); + +	hlist_del_rcu(&vport->hash_node); + +	vport->ops->destroy(vport); +} + +/** + *	ovs_vport_get_stats - retrieve device stats + * + * @vport: vport from which to retrieve the stats + * @stats: location to store stats + * + * Retrieves transmit, receive, and error stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. 
+ */ +void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) +{ +	int i; + +	memset(stats, 0, sizeof(*stats)); + +	/* We potentially have 2 sources of stats that need to be combined: +	 * those we have collected (split into err_stats and percpu_stats) from +	 * set_stats() and device error stats from netdev->get_stats() (for +	 * errors that happen  downstream and therefore aren't reported through +	 * our vport_record_error() function). +	 * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). +	 * netdev-stats can be directly read over netlink-ioctl. +	 */ + +	spin_lock_bh(&vport->stats_lock); + +	stats->rx_errors	= vport->err_stats.rx_errors; +	stats->tx_errors	= vport->err_stats.tx_errors; +	stats->tx_dropped	= vport->err_stats.tx_dropped; +	stats->rx_dropped	= vport->err_stats.rx_dropped; + +	spin_unlock_bh(&vport->stats_lock); + +	for_each_possible_cpu(i) { +		const struct pcpu_sw_netstats *percpu_stats; +		struct pcpu_sw_netstats local_stats; +		unsigned int start; + +		percpu_stats = per_cpu_ptr(vport->percpu_stats, i); + +		do { +			start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); +			local_stats = *percpu_stats; +		} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + +		stats->rx_bytes		+= local_stats.rx_bytes; +		stats->rx_packets	+= local_stats.rx_packets; +		stats->tx_bytes		+= local_stats.tx_bytes; +		stats->tx_packets	+= local_stats.tx_packets; +	} +} + +/** + *	ovs_vport_get_options - retrieve device options + * + * @vport: vport from which to retrieve the options. + * @skb: sk_buff where options should be appended. + * + * Retrieves the configuration of the given device, appending an + * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested + * vport-specific attributes to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another + * negative error code if a real error occurred.  If an error occurs, @skb is + * left unmodified. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) +{ +	struct nlattr *nla; +	int err; + +	if (!vport->ops->get_options) +		return 0; + +	nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); +	if (!nla) +		return -EMSGSIZE; + +	err = vport->ops->get_options(vport, skb); +	if (err) { +		nla_nest_cancel(skb, nla); +		return err; +	} + +	nla_nest_end(skb, nla); +	return 0; +} + +/** + *	ovs_vport_receive - pass up received packet to the datapath for processing + * + * @vport: vport that received the packet + * @skb: skb that was received + * @tun_key: tunnel (if any) that carried packet + * + * Must be called with rcu_read_lock.  The packet cannot be shared and + * skb->data should point to the Ethernet header. + */ +void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, +		       struct ovs_key_ipv4_tunnel *tun_key) +{ +	struct pcpu_sw_netstats *stats; + +	stats = this_cpu_ptr(vport->percpu_stats); +	u64_stats_update_begin(&stats->syncp); +	stats->rx_packets++; +	stats->rx_bytes += skb->len; +	u64_stats_update_end(&stats->syncp); + +	OVS_CB(skb)->tun_key = tun_key; +	ovs_dp_process_received_packet(vport, skb); +} + +/** + *	ovs_vport_send - send a packet on a device + * + * @vport: vport on which to send the packet + * @skb: skb to send + * + * Sends the given packet and returns the length of data sent.  Either ovs + * lock or rcu_read_lock must be held. 
+ */ +int ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ +	int sent = vport->ops->send(vport, skb); + +	if (likely(sent > 0)) { +		struct pcpu_sw_netstats *stats; + +		stats = this_cpu_ptr(vport->percpu_stats); + +		u64_stats_update_begin(&stats->syncp); +		stats->tx_packets++; +		stats->tx_bytes += sent; +		u64_stats_update_end(&stats->syncp); +	} else if (sent < 0) { +		ovs_vport_record_error(vport, VPORT_E_TX_ERROR); +		kfree_skb(skb); +	} else +		ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); + +	return sent; +} + +/** + *	ovs_vport_record_error - indicate device error to generic stats layer + * + * @vport: vport that encountered the error + * @err_type: one of enum vport_err_type types to indicate the error type + * + * If using the vport generic stats layer indicate that an error of the given + * type has occurred. + */ +static void ovs_vport_record_error(struct vport *vport, +				   enum vport_err_type err_type) +{ +	spin_lock(&vport->stats_lock); + +	switch (err_type) { +	case VPORT_E_RX_DROPPED: +		vport->err_stats.rx_dropped++; +		break; + +	case VPORT_E_RX_ERROR: +		vport->err_stats.rx_errors++; +		break; + +	case VPORT_E_TX_DROPPED: +		vport->err_stats.tx_dropped++; +		break; + +	case VPORT_E_TX_ERROR: +		vport->err_stats.tx_errors++; +		break; +	} + +	spin_unlock(&vport->stats_lock); +} + +static void free_vport_rcu(struct rcu_head *rcu) +{ +	struct vport *vport = container_of(rcu, struct vport, rcu); + +	ovs_vport_free(vport); +} + +void ovs_vport_deferred_free(struct vport *vport) +{ +	if (!vport) +		return; + +	call_rcu(&vport->rcu, free_vport_rcu); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h new file mode 100644 index 00000000000..8d721e62f38 --- /dev/null +++ b/net/openvswitch/vport.h @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_H
+#define VPORT_H 1
+
+#include <linux/if_tunnel.h>
+#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/u64_stats_sync.h>
+
+#include "datapath.h"
+
+struct vport;
+struct vport_parms;
+
+/* The following definitions are for users of the vport subsystem: */
+struct vport_net {
+	struct vport __rcu *gre_vport;
+};
+
+int ovs_vport_init(void);
+void ovs_vport_exit(void);
+
+struct vport *ovs_vport_add(const struct vport_parms *);
+void ovs_vport_del(struct vport *);
+
+struct vport *ovs_vport_locate(struct net *net, const char *name);
+
+void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
+
+int ovs_vport_set_options(struct vport *, struct nlattr *options);
+int ovs_vport_get_options(const struct vport *, struct sk_buff *);
+
+int ovs_vport_send(struct vport *, struct sk_buff *);
+
+/* The following definitions are for implementers of vport devices: */
+
+struct vport_err_stats {
+	u64 rx_dropped;
+	u64 rx_errors;
+	u64 tx_dropped;
+	u64 tx_errors;
+};
+
+/**
+ * struct vport - one port within a datapath
+ * @rcu: RCU callback head for deferred destruction.
+ * @dp: Datapath to which this port belongs.
+ * @upcall_portid: The Netlink port to use for packets received on this port that
+ * miss the flow table.
+ * @port_no: Index into @dp's @ports array.
+ * @hash_node: Element in @dev_table hash table in vport.c.
+ * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
+ * @ops: Class structure.
+ * @percpu_stats: Points to per-CPU statistics used and maintained by vport
+ * @stats_lock: Protects @err_stats.
+ * @err_stats: Points to error statistics used and maintained by vport
+ */
+struct vport {
+	struct rcu_head rcu;
+	struct datapath	*dp;
+	u32 upcall_portid;
+	u16 port_no;
+
+	struct hlist_node hash_node;
+	struct hlist_node dp_hash_node;
+	const struct vport_ops *ops;
+
+	struct pcpu_sw_netstats __percpu *percpu_stats;
+
+	spinlock_t stats_lock;
+	struct vport_err_stats err_stats;
+};
+
+/**
+ * struct vport_parms - parameters for creating a new vport
+ *
+ * @name: New vport's name.
+ * @type: New vport's type.
+ * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
+ * none was supplied.
+ * @dp: New vport's datapath.
+ * @port_no: New vport's port number.
+ * @upcall_portid: The Netlink port to use for packets received on this vport
+ * that miss the flow table.
+ */
+struct vport_parms {
+	const char *name;
+	enum ovs_vport_type type;
+	struct nlattr *options;
+
+	/* For ovs_vport_alloc(). */
+	struct datapath *dp;
+	u16 port_no;
+	u32 upcall_portid;
+};
+
+/**
+ * struct vport_ops - definition of a type of virtual port
+ *
+ * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
+ * @create: Create a new vport configured as specified.  On success returns
+ * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
+ * @destroy: Destroys a vport.  Must call vport_free() on the vport but not
+ * before an RCU grace period has elapsed.
+ * @set_options: Modify the configuration of an existing vport.  May be %NULL
+ * if modification is not supported.
+ * @get_options: Appends vport-specific attributes for the configuration of an
+ * existing vport to a &struct sk_buff.  
May be %NULL for a vport that does not + * have any configuration. + * @get_name: Get the device's name. + * @send: Send a packet on the device.  Returns the length of the packet sent, + * zero for dropped packets or negative for error. + */ +struct vport_ops { +	enum ovs_vport_type type; + +	/* Called with ovs_mutex. */ +	struct vport *(*create)(const struct vport_parms *); +	void (*destroy)(struct vport *); + +	int (*set_options)(struct vport *, struct nlattr *); +	int (*get_options)(const struct vport *, struct sk_buff *); + +	/* Called with rcu_read_lock or ovs_mutex. */ +	const char *(*get_name)(const struct vport *); + +	int (*send)(struct vport *, struct sk_buff *); +}; + +enum vport_err_type { +	VPORT_E_RX_DROPPED, +	VPORT_E_RX_ERROR, +	VPORT_E_TX_DROPPED, +	VPORT_E_TX_ERROR, +}; + +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, +			      const struct vport_parms *); +void ovs_vport_free(struct vport *); +void ovs_vport_deferred_free(struct vport *vport); + +#define VPORT_ALIGN 8 + +/** + *	vport_priv - access private data area of vport + * + * @vport: vport to access + * + * If a nonzero size was passed in priv_size of vport_alloc() a private data + * area was allocated on creation.  This allows that area to be accessed and + * used for any purpose needed by the vport implementer. + */ +static inline void *vport_priv(const struct vport *vport) +{ +	return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); +} + +/** + *	vport_from_priv - lookup vport from private data pointer + * + * @priv: Start of private data area. + * + * It is sometimes useful to translate from a pointer to the private data + * area to the vport, such as in the case where the private data pointer is + * the result of a hash table lookup.  @priv must point to the start of the + * private data area. + */ +static inline struct vport *vport_from_priv(void *priv) +{ +	return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); +} + +void ovs_vport_receive(struct vport *, struct sk_buff *, +		       struct ovs_key_ipv4_tunnel *); + +/* List of statically compiled vport implementations.  Don't forget to also + * add yours to the list at the top of vport.c. */ +extern const struct vport_ops ovs_netdev_vport_ops; +extern const struct vport_ops ovs_internal_vport_ops; +extern const struct vport_ops ovs_gre_vport_ops; +extern const struct vport_ops ovs_vxlan_vport_ops; + +static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, +				      const void *start, unsigned int len) +{ +	if (skb->ip_summed == CHECKSUM_COMPLETE) +		skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); +} + +#endif /* vport.h */  | 
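Editor's note: the vport.h declarations above define the contract that the netdev, internal, GRE and VXLAN implementations in this patch all follow, but the patch contains no minimal end-to-end example of the ovs_vport_alloc()/vport_priv() pattern. The sketch below is illustrative only and is not part of the patch: the "dummy" identifiers are invented, OVS_VPORT_TYPE_NETDEV is reused purely as a placeholder type value, and only functions and macros declared in the files shown above are used.

/* Illustrative sketch, assuming the vport.h API shown above: a minimal
 * vport type that keeps its name in the private data area reached via
 * vport_priv() and frees itself only after an RCU grace period, as the
 * vport_ops kernel-doc requires. */
#include <linux/err.h>
#include <linux/if.h>
#include <linux/skbuff.h>
#include <linux/string.h>

#include "vport.h"

struct dummy_port {
	char name[IFNAMSIZ];		/* private data, reached via vport_priv() */
};

static struct vport *dummy_create(const struct vport_parms *parms);
static void dummy_destroy(struct vport *vport);
static const char *dummy_get_name(const struct vport *vport);
static int dummy_send(struct vport *vport, struct sk_buff *skb);

static const struct vport_ops ovs_dummy_vport_ops = {
	.type		= OVS_VPORT_TYPE_NETDEV,	/* placeholder type, illustration only */
	.create		= dummy_create,
	.destroy	= dummy_destroy,
	.get_name	= dummy_get_name,
	.send		= dummy_send,
};

static struct vport *dummy_create(const struct vport_parms *parms)
{
	struct vport *vport;
	struct dummy_port *dummy;

	/* Allocates struct vport plus an aligned private area of the
	 * requested size; dp, port_no and upcall_portid are taken from
	 * @parms by ovs_vport_alloc() itself. */
	vport = ovs_vport_alloc(sizeof(struct dummy_port),
				&ovs_dummy_vport_ops, parms);
	if (IS_ERR(vport))
		return vport;

	dummy = vport_priv(vport);
	strncpy(dummy->name, parms->name, IFNAMSIZ);

	return vport;
}

static void dummy_destroy(struct vport *vport)
{
	/* Defer the actual ovs_vport_free() until an RCU grace period has passed. */
	ovs_vport_deferred_free(vport);
}

static const char *dummy_get_name(const struct vport *vport)
{
	const struct dummy_port *dummy = vport_priv(vport);

	return dummy->name;
}

static int dummy_send(struct vport *vport, struct sk_buff *skb)
{
	/* Per the @send contract: return bytes sent, zero for a dropped
	 * packet, negative for an error.  This stub simply drops. */
	kfree_skb(skb);
	return 0;
}

Usage follows the same shape as the real implementations above: ovs_vport_add() dispatches on parms->type through vport_ops_list in vport.c, so a real implementation would also need its own OVS_VPORT_TYPE_* value and an entry in that array, and ovs_vport_send() would account the byte count returned by dummy_send() in the per-CPU tx stats.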
