diff options
-rw-r--r-- | Documentation/networking/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/networking/openvswitch.txt | 195 | ||||
-rw-r--r-- | MAINTAINERS | 8 | ||||
-rw-r--r-- | include/linux/openvswitch.h | 452 | ||||
-rw-r--r-- | net/Kconfig | 1 | ||||
-rw-r--r-- | net/Makefile | 1 | ||||
-rw-r--r-- | net/openvswitch/Kconfig | 28 | ||||
-rw-r--r-- | net/openvswitch/Makefile | 14 | ||||
-rw-r--r-- | net/openvswitch/actions.c | 415 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 1912 | ||||
-rw-r--r-- | net/openvswitch/datapath.h | 125 | ||||
-rw-r--r-- | net/openvswitch/dp_notify.c | 66 | ||||
-rw-r--r-- | net/openvswitch/flow.c | 1346 | ||||
-rw-r--r-- | net/openvswitch/flow.h | 199 | ||||
-rw-r--r-- | net/openvswitch/vport-internal_dev.c | 241 | ||||
-rw-r--r-- | net/openvswitch/vport-internal_dev.h | 28 | ||||
-rw-r--r-- | net/openvswitch/vport-netdev.c | 198 | ||||
-rw-r--r-- | net/openvswitch/vport-netdev.h | 42 | ||||
-rw-r--r-- | net/openvswitch/vport.c | 396 | ||||
-rw-r--r-- | net/openvswitch/vport.h | 205 |
20 files changed, 5874 insertions, 0 deletions
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index bbce1215434..9ad9ddeb384 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -144,6 +144,8 @@ nfc.txt - The Linux Near Field Communication (NFS) subsystem. olympic.txt - IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info. +openvswitch.txt + - Open vSwitch developer documentation. operstates.txt - Overview of network interface operational states. packet_mmap.txt diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt new file mode 100644 index 00000000000..b8a048b8df3 --- /dev/null +++ b/Documentation/networking/openvswitch.txt @@ -0,0 +1,195 @@ +Open vSwitch datapath developer documentation +============================================= + +The Open vSwitch kernel module allows flexible userspace control over +flow-level packet processing on selected network devices. It can be +used to implement a plain Ethernet switch, network device bonding, +VLAN processing, network access control, flow-based network control, +and so on. + +The kernel module implements multiple "datapaths" (analogous to +bridges), each of which can have multiple "vports" (analogous to ports +within a bridge). Each datapath also has associated with it a "flow +table" that userspace populates with "flows" that map from keys based +on packet headers and metadata to sets of actions. The most common +action forwards the packet to another vport; other actions are also +implemented. + +When a packet arrives on a vport, the kernel module processes it by +extracting its flow key and looking it up in the flow table. If there +is a matching flow, it executes the associated actions. If there is +no match, it queues the packet to userspace for processing (as part of +its processing, userspace will likely set up a flow to handle further +packets of the same type entirely in-kernel). + + +Flow key compatibility +---------------------- + +Network protocols evolve over time. New protocols become important +and existing protocols lose their prominence. For the Open vSwitch +kernel module to remain relevant, it must be possible for newer +versions to parse additional protocols as part of the flow key. It +might even be desirable, someday, to drop support for parsing +protocols that have become obsolete. Therefore, the Netlink interface +to Open vSwitch is designed to allow carefully written userspace +applications to work with any version of the flow key, past or future. + +To support this forward and backward compatibility, whenever the +kernel module passes a packet to userspace, it also passes along the +flow key that it parsed from the packet. Userspace then extracts its +own notion of a flow key from the packet and compares it against the +kernel-provided version: + + - If userspace's notion of the flow key for the packet matches the + kernel's, then nothing special is necessary. + + - If the kernel's flow key includes more fields than the userspace + version of the flow key, for example if the kernel decoded IPv6 + headers but userspace stopped at the Ethernet type (because it + does not understand IPv6), then again nothing special is + necessary. Userspace can still set up a flow in the usual way, + as long as it uses the kernel-provided flow key to do it. + + - If the userspace flow key includes more fields than the + kernel's, for example if userspace decoded an IPv6 header but + the kernel stopped at the Ethernet type, then userspace can + forward the packet manually, without setting up a flow in the + kernel. This case is bad for performance because every packet + that the kernel considers part of the flow must go to userspace, + but the forwarding behavior is correct. (If userspace can + determine that the values of the extra fields would not affect + forwarding behavior, then it could set up a flow anyway.) + +How flow keys evolve over time is important to making this work, so +the following sections go into detail. + + +Flow key format +--------------- + +A flow key is passed over a Netlink socket as a sequence of Netlink +attributes. Some attributes represent packet metadata, defined as any +information about a packet that cannot be extracted from the packet +itself, e.g. the vport on which the packet was received. Most +attributes, however, are extracted from headers within the packet, +e.g. source and destination addresses from Ethernet, IP, or TCP +headers. + +The <linux/openvswitch.h> header file defines the exact format of the +flow key attributes. For informal explanatory purposes here, we write +them as comma-separated strings, with parentheses indicating arguments +and nesting. For example, the following could represent a flow key +corresponding to a TCP packet that arrived on vport 1: + + in_port(1), eth(src=e0:91:f5:21:d0:b2, dst=00:02:e3:0f:80:a4), + eth_type(0x0800), ipv4(src=172.16.0.20, dst=172.18.0.52, proto=17, tos=0, + frag=no), tcp(src=49163, dst=80) + +Often we ellipsize arguments not important to the discussion, e.g.: + + in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...) + + +Basic rule for evolving flow keys +--------------------------------- + +Some care is needed to really maintain forward and backward +compatibility for applications that follow the rules listed under +"Flow key compatibility" above. + +The basic rule is obvious: + + ------------------------------------------------------------------ + New network protocol support must only supplement existing flow + key attributes. It must not change the meaning of already defined + flow key attributes. + ------------------------------------------------------------------ + +This rule does have less-obvious consequences so it is worth working +through a few examples. Suppose, for example, that the kernel module +did not already implement VLAN parsing. Instead, it just interpreted +the 802.1Q TPID (0x8100) as the Ethertype then stopped parsing the +packet. The flow key for any packet with an 802.1Q header would look +essentially like this, ignoring metadata: + + eth(...), eth_type(0x8100) + +Naively, to add VLAN support, it makes sense to add a new "vlan" flow +key attribute to contain the VLAN tag, then continue to decode the +encapsulated headers beyond the VLAN tag using the existing field +definitions. With this change, an TCP packet in VLAN 10 would have a +flow key much like this: + + eth(...), vlan(vid=10, pcp=0), eth_type(0x0800), ip(proto=6, ...), tcp(...) + +But this change would negatively affect a userspace application that +has not been updated to understand the new "vlan" flow key attribute. +The application could, following the flow compatibility rules above, +ignore the "vlan" attribute that it does not understand and therefore +assume that the flow contained IP packets. This is a bad assumption +(the flow only contains IP packets if one parses and skips over the +802.1Q header) and it could cause the application's behavior to change +across kernel versions even though it follows the compatibility rules. + +The solution is to use a set of nested attributes. This is, for +example, why 802.1Q support uses nested attributes. A TCP packet in +VLAN 10 is actually expressed as: + + eth(...), eth_type(0x8100), vlan(vid=10, pcp=0), encap(eth_type(0x0800), + ip(proto=6, ...), tcp(...))) + +Notice how the "eth_type", "ip", and "tcp" flow key attributes are +nested inside the "encap" attribute. Thus, an application that does +not understand the "vlan" key will not see either of those attributes +and therefore will not misinterpret them. (Also, the outer eth_type +is still 0x8100, not changed to 0x0800.) + +Handling malformed packets +-------------------------- + +Don't drop packets in the kernel for malformed protocol headers, bad +checksums, etc. This would prevent userspace from implementing a +simple Ethernet switch that forwards every packet. + +Instead, in such a case, include an attribute with "empty" content. +It doesn't matter if the empty content could be valid protocol values, +as long as those values are rarely seen in practice, because userspace +can always forward all packets with those values to userspace and +handle them individually. + +For example, consider a packet that contains an IP header that +indicates protocol 6 for TCP, but which is truncated just after the IP +header, so that the TCP header is missing. The flow key for this +packet would include a tcp attribute with all-zero src and dst, like +this: + + eth(...), eth_type(0x0800), ip(proto=6, ...), tcp(src=0, dst=0) + +As another example, consider a packet with an Ethernet type of 0x8100, +indicating that a VLAN TCI should follow, but which is truncated just +after the Ethernet type. The flow key for this packet would include +an all-zero-bits vlan and an empty encap attribute, like this: + + eth(...), eth_type(0x8100), vlan(0), encap() + +Unlike a TCP packet with source and destination ports 0, an +all-zero-bits VLAN TCI is not that rare, so the CFI bit (aka +VLAN_TAG_PRESENT inside the kernel) is ordinarily set in a vlan +attribute expressly to allow this situation to be distinguished. +Thus, the flow key in this second example unambiguously indicates a +missing or malformed VLAN TCI. + +Other rules +----------- + +The other rules for flow keys are much less subtle: + + - Duplicate attributes are not allowed at a given nesting level. + + - Ordering of attributes is not significant. + + - When the kernel sends a given flow key to userspace, it always + composes it the same way. This allows userspace to hash and + compare entire flow keys that it may not be able to fully + interpret. diff --git a/MAINTAINERS b/MAINTAINERS index c88eb7bb3a6..209ad0695ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4868,6 +4868,14 @@ S: Maintained T: git git://openrisc.net/~jonas/linux F: arch/openrisc +OPENVSWITCH +M: Jesse Gross <jesse@nicira.com> +L: dev@openvswitch.org +W: http://openvswitch.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jesse/openvswitch.git +S: Maintained +F: net/openvswitch/ + OPL4 DRIVER M: Clemens Ladisch <clemens@ladisch.de> L: alsa-devel@alsa-project.org (moderated for non-subscribers) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h new file mode 100644 index 00000000000..eb1efa54fe8 --- /dev/null +++ b/include/linux/openvswitch.h @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef _LINUX_OPENVSWITCH_H +#define _LINUX_OPENVSWITCH_H 1 + +#include <linux/types.h> + +/** + * struct ovs_header - header for OVS Generic Netlink messages. + * @dp_ifindex: ifindex of local port for datapath (0 to make a request not + * specific to a datapath). + * + * Attributes following the header are specific to a particular OVS Generic + * Netlink family, but all of the OVS families use this header. + */ + +struct ovs_header { + int dp_ifindex; +}; + +/* Datapaths. */ + +#define OVS_DATAPATH_FAMILY "ovs_datapath" +#define OVS_DATAPATH_MCGROUP "ovs_datapath" +#define OVS_DATAPATH_VERSION 0x1 + +enum ovs_datapath_cmd { + OVS_DP_CMD_UNSPEC, + OVS_DP_CMD_NEW, + OVS_DP_CMD_DEL, + OVS_DP_CMD_GET, + OVS_DP_CMD_SET +}; + +/** + * enum ovs_datapath_attr - attributes for %OVS_DP_* commands. + * @OVS_DP_ATTR_NAME: Name of the network device that serves as the "local + * port". This is the name of the network device whose dp_ifindex is given in + * the &struct ovs_header. Always present in notifications. Required in + * %OVS_DP_NEW requests. May be used as an alternative to specifying + * dp_ifindex in other requests (with a dp_ifindex of 0). + * @OVS_DP_ATTR_UPCALL_PID: The Netlink socket in userspace that is initially + * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on + * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should + * not be sent. + * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the + * datapath. Always present in notifications. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_DP_* commands. + */ +enum ovs_datapath_attr { + OVS_DP_ATTR_UNSPEC, + OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */ + OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */ + OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */ + __OVS_DP_ATTR_MAX +}; + +#define OVS_DP_ATTR_MAX (__OVS_DP_ATTR_MAX - 1) + +struct ovs_dp_stats { + __u64 n_hit; /* Number of flow table matches. */ + __u64 n_missed; /* Number of flow table misses. */ + __u64 n_lost; /* Number of misses not sent to userspace. */ + __u64 n_flows; /* Number of flows present */ +}; + +struct ovs_vport_stats { + __u64 rx_packets; /* total packets received */ + __u64 tx_packets; /* total packets transmitted */ + __u64 rx_bytes; /* total bytes received */ + __u64 tx_bytes; /* total bytes transmitted */ + __u64 rx_errors; /* bad packets received */ + __u64 tx_errors; /* packet transmit problems */ + __u64 rx_dropped; /* no space in linux buffers */ + __u64 tx_dropped; /* no space available in linux */ +}; + +/* Fixed logical ports. */ +#define OVSP_LOCAL ((__u16)0) + +/* Packet transfer. */ + +#define OVS_PACKET_FAMILY "ovs_packet" +#define OVS_PACKET_VERSION 0x1 + +enum ovs_packet_cmd { + OVS_PACKET_CMD_UNSPEC, + + /* Kernel-to-user notifications. */ + OVS_PACKET_CMD_MISS, /* Flow table miss. */ + OVS_PACKET_CMD_ACTION, /* OVS_ACTION_ATTR_USERSPACE action. */ + + /* Userspace commands. */ + OVS_PACKET_CMD_EXECUTE /* Apply actions to a packet. */ +}; + +/** + * enum ovs_packet_attr - attributes for %OVS_PACKET_* commands. + * @OVS_PACKET_ATTR_PACKET: Present for all notifications. Contains the entire + * packet as received, from the start of the Ethernet header onward. For + * %OVS_PACKET_CMD_ACTION, %OVS_PACKET_ATTR_PACKET reflects changes made by + * actions preceding %OVS_ACTION_ATTR_USERSPACE, but %OVS_PACKET_ATTR_KEY is + * the flow key extracted from the packet as originally received. + * @OVS_PACKET_ATTR_KEY: Present for all notifications. Contains the flow key + * extracted from the packet as nested %OVS_KEY_ATTR_* attributes. This allows + * userspace to adapt its flow setup strategy by comparing its notion of the + * flow key against the kernel's. + * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet. Used + * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes. + * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION + * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an + * %OVS_USERSPACE_ATTR_USERDATA attribute. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_PACKET_* commands. + */ +enum ovs_packet_attr { + OVS_PACKET_ATTR_UNSPEC, + OVS_PACKET_ATTR_PACKET, /* Packet data. */ + OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */ + OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + OVS_PACKET_ATTR_USERDATA, /* u64 OVS_ACTION_ATTR_USERSPACE arg. */ + __OVS_PACKET_ATTR_MAX +}; + +#define OVS_PACKET_ATTR_MAX (__OVS_PACKET_ATTR_MAX - 1) + +/* Virtual ports. */ + +#define OVS_VPORT_FAMILY "ovs_vport" +#define OVS_VPORT_MCGROUP "ovs_vport" +#define OVS_VPORT_VERSION 0x1 + +enum ovs_vport_cmd { + OVS_VPORT_CMD_UNSPEC, + OVS_VPORT_CMD_NEW, + OVS_VPORT_CMD_DEL, + OVS_VPORT_CMD_GET, + OVS_VPORT_CMD_SET +}; + +enum ovs_vport_type { + OVS_VPORT_TYPE_UNSPEC, + OVS_VPORT_TYPE_NETDEV, /* network device */ + OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ + __OVS_VPORT_TYPE_MAX +}; + +#define OVS_VPORT_TYPE_MAX (__OVS_VPORT_TYPE_MAX - 1) + +/** + * enum ovs_vport_attr - attributes for %OVS_VPORT_* commands. + * @OVS_VPORT_ATTR_PORT_NO: 32-bit port number within datapath. + * @OVS_VPORT_ATTR_TYPE: 32-bit %OVS_VPORT_TYPE_* constant describing the type + * of vport. + * @OVS_VPORT_ATTR_NAME: Name of vport. For a vport based on a network device + * this is the name of the network device. Maximum length %IFNAMSIZ-1 bytes + * plus a null terminator. + * @OVS_VPORT_ATTR_OPTIONS: Vport-specific configuration information. + * @OVS_VPORT_ATTR_UPCALL_PID: The Netlink socket in userspace that + * OVS_PACKET_CMD_MISS upcalls will be directed to for packets received on + * this port. A value of zero indicates that upcalls should not be sent. + * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for + * packets sent or received through the vport. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_VPORT_* commands. + * + * For %OVS_VPORT_CMD_NEW requests, the %OVS_VPORT_ATTR_TYPE and + * %OVS_VPORT_ATTR_NAME attributes are required. %OVS_VPORT_ATTR_PORT_NO is + * optional; if not specified a free port number is automatically selected. + * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type + * of vport. + * and other attributes are ignored. + * + * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to + * look up the vport to operate on; otherwise dp_idx from the &struct + * ovs_header plus %OVS_VPORT_ATTR_PORT_NO determine the vport. + */ +enum ovs_vport_attr { + OVS_VPORT_ATTR_UNSPEC, + OVS_VPORT_ATTR_PORT_NO, /* u32 port number within datapath */ + OVS_VPORT_ATTR_TYPE, /* u32 OVS_VPORT_TYPE_* constant. */ + OVS_VPORT_ATTR_NAME, /* string name, up to IFNAMSIZ bytes long */ + OVS_VPORT_ATTR_OPTIONS, /* nested attributes, varies by vport type */ + OVS_VPORT_ATTR_UPCALL_PID, /* u32 Netlink PID to receive upcalls */ + OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ + __OVS_VPORT_ATTR_MAX +}; + +#define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) + +/* Flows. */ + +#define OVS_FLOW_FAMILY "ovs_flow" +#define OVS_FLOW_MCGROUP "ovs_flow" +#define OVS_FLOW_VERSION 0x1 + +enum ovs_flow_cmd { + OVS_FLOW_CMD_UNSPEC, + OVS_FLOW_CMD_NEW, + OVS_FLOW_CMD_DEL, + OVS_FLOW_CMD_GET, + OVS_FLOW_CMD_SET +}; + +struct ovs_flow_stats { + __u64 n_packets; /* Number of matched packets. */ + __u64 n_bytes; /* Number of matched bytes. */ +}; + +enum ovs_key_attr { + OVS_KEY_ATTR_UNSPEC, + OVS_KEY_ATTR_ENCAP, /* Nested set of encapsulated attributes. */ + OVS_KEY_ATTR_PRIORITY, /* u32 skb->priority */ + OVS_KEY_ATTR_IN_PORT, /* u32 OVS dp port number */ + OVS_KEY_ATTR_ETHERNET, /* struct ovs_key_ethernet */ + OVS_KEY_ATTR_VLAN, /* be16 VLAN TCI */ + OVS_KEY_ATTR_ETHERTYPE, /* be16 Ethernet type */ + OVS_KEY_ATTR_IPV4, /* struct ovs_key_ipv4 */ + OVS_KEY_ATTR_IPV6, /* struct ovs_key_ipv6 */ + OVS_KEY_ATTR_TCP, /* struct ovs_key_tcp */ + OVS_KEY_ATTR_UDP, /* struct ovs_key_udp */ + OVS_KEY_ATTR_ICMP, /* struct ovs_key_icmp */ + OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */ + OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ + OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ + __OVS_KEY_ATTR_MAX +}; + +#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1) + +/** + * enum ovs_frag_type - IPv4 and IPv6 fragment type + * @OVS_FRAG_TYPE_NONE: Packet is not a fragment. + * @OVS_FRAG_TYPE_FIRST: Packet is a fragment with offset 0. + * @OVS_FRAG_TYPE_LATER: Packet is a fragment with nonzero offset. + * + * Used as the @ipv4_frag in &struct ovs_key_ipv4 and as @ipv6_frag &struct + * ovs_key_ipv6. + */ +enum ovs_frag_type { + OVS_FRAG_TYPE_NONE, + OVS_FRAG_TYPE_FIRST, + OVS_FRAG_TYPE_LATER, + __OVS_FRAG_TYPE_MAX +}; + +#define OVS_FRAG_TYPE_MAX (__OVS_FRAG_TYPE_MAX - 1) + +struct ovs_key_ethernet { + __u8 eth_src[6]; + __u8 eth_dst[6]; +}; + +struct ovs_key_ipv4 { + __be32 ipv4_src; + __be32 ipv4_dst; + __u8 ipv4_proto; + __u8 ipv4_tos; + __u8 ipv4_ttl; + __u8 ipv4_frag; /* One of OVS_FRAG_TYPE_*. */ +}; + +struct ovs_key_ipv6 { + __be32 ipv6_src[4]; + __be32 ipv6_dst[4]; + __be32 ipv6_label; /* 20-bits in least-significant bits. */ + __u8 ipv6_proto; + __u8 ipv6_tclass; + __u8 ipv6_hlimit; + __u8 ipv6_frag; /* One of OVS_FRAG_TYPE_*. */ +}; + +struct ovs_key_tcp { + __be16 tcp_src; + __be16 tcp_dst; +}; + +struct ovs_key_udp { + __be16 udp_src; + __be16 udp_dst; +}; + +struct ovs_key_icmp { + __u8 icmp_type; + __u8 icmp_code; +}; + +struct ovs_key_icmpv6 { + __u8 icmpv6_type; + __u8 icmpv6_code; +}; + +struct ovs_key_arp { + __be32 arp_sip; + __be32 arp_tip; + __be16 arp_op; + __u8 arp_sha[6]; + __u8 arp_tha[6]; +}; + +struct ovs_key_nd { + __u32 nd_target[4]; + __u8 nd_sll[6]; + __u8 nd_tll[6]; +}; + +/** + * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. + * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow + * key. Always present in notifications. Required for all requests (except + * dumps). + * @OVS_FLOW_ATTR_ACTIONS: Nested %OVS_ACTION_ATTR_* attributes specifying + * the actions to take for packets that match the key. Always present in + * notifications. Required for %OVS_FLOW_CMD_NEW requests, optional for + * %OVS_FLOW_CMD_SET requests. + * @OVS_FLOW_ATTR_STATS: &struct ovs_flow_stats giving statistics for this + * flow. Present in notifications if the stats would be nonzero. Ignored in + * requests. + * @OVS_FLOW_ATTR_TCP_FLAGS: An 8-bit value giving the OR'd value of all of the + * TCP flags seen on packets in this flow. Only present in notifications for + * TCP flows, and only if it would be nonzero. Ignored in requests. + * @OVS_FLOW_ATTR_USED: A 64-bit integer giving the time, in milliseconds on + * the system monotonic clock, at which a packet was last processed for this + * flow. Only present in notifications if a packet has been processed for this + * flow. Ignored in requests. + * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the + * last-used time, accumulated TCP flags, and statistics for this flow. + * Otherwise ignored in requests. Never present in notifications. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_FLOW_* commands. + */ +enum ovs_flow_attr { + OVS_FLOW_ATTR_UNSPEC, + OVS_FLOW_ATTR_KEY, /* Sequence of OVS_KEY_ATTR_* attributes. */ + OVS_FLOW_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + OVS_FLOW_ATTR_STATS, /* struct ovs_flow_stats. */ + OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */ + OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ + OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ + __OVS_FLOW_ATTR_MAX +}; + +#define OVS_FLOW_ATTR_MAX (__OVS_FLOW_ATTR_MAX - 1) + +/** + * enum ovs_sample_attr - Attributes for %OVS_ACTION_ATTR_SAMPLE action. + * @OVS_SAMPLE_ATTR_PROBABILITY: 32-bit fraction of packets to sample with + * @OVS_ACTION_ATTR_SAMPLE. A value of 0 samples no packets, a value of + * %UINT32_MAX samples all packets and intermediate values sample intermediate + * fractions of packets. + * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event. + * Actions are passed as nested attributes. + * + * Executes the specified actions with the given probability on a per-packet + * basis. + */ +enum ovs_sample_attr { + OVS_SAMPLE_ATTR_UNSPEC, + OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */ + OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + __OVS_SAMPLE_ATTR_MAX, +}; + +#define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1) + +/** + * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action. + * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION + * message should be sent. Required. + * @OVS_USERSPACE_ATTR_USERDATA: If present, its u64 argument is copied to the + * %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA, + */ +enum ovs_userspace_attr { + OVS_USERSPACE_ATTR_UNSPEC, + OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */ + OVS_USERSPACE_ATTR_USERDATA, /* u64 optional user-specified cookie. */ + __OVS_USERSPACE_ATTR_MAX +}; + +#define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) + +/** + * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. + * @vlan_tpid: Tag protocol identifier (TPID) to push. + * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set + * (but it will not be set in the 802.1Q header that is pushed). + * + * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID + * values are those that the kernel module also parses as 802.1Q headers, to + * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN + * from having surprising results. + */ +struct ovs_action_push_vlan { + __be16 vlan_tpid; /* 802.1Q TPID. */ + __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ +}; + +/** + * enum ovs_action_attr - Action types. + * + * @OVS_ACTION_ATTR_OUTPUT: Output packet to port. + * @OVS_ACTION_ATTR_USERSPACE: Send packet to userspace according to nested + * %OVS_USERSPACE_ATTR_* attributes. + * @OVS_ACTION_ATTR_SET: Replaces the contents of an existing header. The + * single nested %OVS_KEY_ATTR_* attribute specifies a header to modify and its + * value. + * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the + * packet. + * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. + * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in + * the nested %OVS_SAMPLE_ATTR_* attributes. + * + * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all + * fields within a header are modifiable, e.g. the IPv4 protocol and fragment + * type may not be changed. + */ + +enum ovs_action_attr { + OVS_ACTION_ATTR_UNSPEC, + OVS_ACTION_ATTR_OUTPUT, /* u32 port number. */ + OVS_ACTION_ATTR_USERSPACE, /* Nested OVS_USERSPACE_ATTR_*. */ + OVS_ACTION_ATTR_SET, /* One nested OVS_KEY_ATTR_*. */ + OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ + OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ + OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ + __OVS_ACTION_ATTR_MAX +}; + +#define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1) + +#endif /* _LINUX_OPENVSWITCH_H */ diff --git a/net/Kconfig b/net/Kconfig index 2d998735c4d..e07272d0bb2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -215,6 +215,7 @@ source "net/sched/Kconfig" source "net/dcb/Kconfig" source "net/dns_resolver/Kconfig" source "net/batman-adv/Kconfig" +source "net/openvswitch/Kconfig" config RPS boolean diff --git a/net/Makefile b/net/Makefile index acdde4950de..ad432fa4d93 100644 --- a/net/Makefile +++ b/net/Makefile @@ -69,3 +69,4 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ +obj-$(CONFIG_OPENVSWITCH) += openvswitch/ diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig new file mode 100644 index 00000000000..d9ea33c361b --- /dev/null +++ b/net/openvswitch/Kconfig @@ -0,0 +1,28 @@ +# +# Open vSwitch +# + +config OPENVSWITCH + tristate "Open vSwitch" + ---help--- + Open vSwitch is a multilayer Ethernet switch targeted at virtualized + environments. In addition to supporting a variety of features + expected in a traditional hardware switch, it enables fine-grained + programmatic extension and flow-based control of the network. This + control is useful in a wide variety of applications but is + particularly important in multi-server virtualization deployments, + which are often characterized by highly dynamic endpoints and the + need to maintain logical abstractions for multiple tenants. + + The Open vSwitch datapath provides an in-kernel fast path for packet + forwarding. It is complemented by a userspace daemon, ovs-vswitchd, + which is able to accept configuration from a variety of sources and + translate it into packet processing rules. + + See http://openvswitch.org for more information and userspace + utilities. + + To compile this code as a module, choose M here: the module will be + called openvswitch. + + If unsure, say N. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile new file mode 100644 index 00000000000..15e7384745c --- /dev/null +++ b/net/openvswitch/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for Open vSwitch. +# + +obj-$(CONFIG_OPENVSWITCH) += openvswitch.o + +openvswitch-y := \ + actions.o \ + datapath.o \ + dp_notify.o \ + flow.o \ + vport.o \ + vport-internal_dev.o \ + vport-netdev.o \ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c new file mode 100644 index 00000000000..2725d1bdf29 --- /dev/null +++ b/net/openvswitch/actions.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/openvswitch.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/in6.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <net/dsfield.h> + +#include "datapath.h" +#include "vport.h" + +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr, int len, bool keep_skb); + +static int make_writable(struct sk_buff *skb, int write_len) +{ + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/* remove VLAN header from packet and update csum accrodingly. */ +static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) +{ + struct vlan_hdr *vhdr; + int err; + + err = make_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, csum_partial(skb->data + + ETH_HLEN, VLAN_HLEN, 0)); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *current_tci = vhdr->h_vlan_TCI; + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + skb_reset_mac_len(skb); + + return 0; +} + +static int pop_vlan(struct sk_buff *skb) +{ + __be16 tci; + int err; + + if (likely(vlan_tx_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely(skb->protocol != htons(ETH_P_8021Q) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __pop_vlan_tci(skb, &tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely(skb->protocol != htons(ETH_P_8021Q) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __pop_vlan_tci(skb, &tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, ntohs(tci)); + return 0; +} + +static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) +{ + if (unlikely(vlan_tx_tag_present(skb))) { + u16 current_tag; + + /* push down current VLAN tag */ + |